# 🎵 Hindi Audio → Text → NER (XLM-RoBERTa) Pipeline

This notebook automatically:
- Transcribes an uploaded audio file or an audio file at a path you provide; if no audio is given, falls back to an existing transcription TXT file (if present)
- Loads your pre-trained XLM-RoBERTa model from `XLM_ROBERTA` folder
- Runs NER on the text and displays tagged output and structured entities

## 🚀 **NEW: Professional Gradio UI Available!**

**Scroll to the last cell to use the advanced Gradio web interface!**

The Gradio UI provides:
- ✨ Beautiful, professional design with custom styling
- 🎨 Color-coded entity visualization
- 📊 Interactive tables and statistics
- 🎯 Easy audio upload and processing
- 📱 Responsive design

**For Google Colab:**
1. Run all cells in order
2. The Gradio interface will launch automatically
3. You'll get a public URL to share or use

Place your files in the same folder as this notebook:
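- `XLM_ROBERTA` folder (required; contains the trained XLM-RoBERTa model — note that the model-loading cell below reads it from `/content/drive/MyDrive/XLM_ROBERTA`, so adjust that path if your folder lives elsewhere)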
- `final_transcription.txt` (optional; used if present)
- Or provide an audio file path when prompted


In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install dependencies
%pip -q install pandas numpy joblib scikit-learn pydub faster-whisper ipywidgets openpyxl transformers torch stanza

import os
from pathlib import Path
import re
import unicodedata
from typing import Dict, List, Tuple, Optional

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

from pydub import AudioSegment
from pydub.silence import split_on_silence
from faster_whisper import WhisperModel

from IPython.display import display

# Import stanza for POS tagging
import stanza



[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m1.1/1.1 MB[0m [31m44.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.1/1.1 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.7/1.7 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.5/40.5 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [9

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


In [None]:
# Helper: Simple Hindi spell checker
class HindiSpellChecker:
    def __init__(self):
        self.hindi_common_words = self._load_hindi_common_words()
        self.common_corrections = self._load_common_corrections()

    def _load_common_corrections(self) -> Dict[str, str]:
        """Return the word -> replacement table that `check_word` consults after cleaning a word."""
        return {"‡§≠‡§ø‡§π‡§æ‡§∞": "‡§¨‡§ø‡§π‡§æ‡§∞", "‡§≠‡§ø‡§π‡§æ‡§∞‡•Ä": "‡§¨‡§ø‡§π‡§æ‡§∞‡•Ä", "‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ": "‡§≠‡§æ‡§∞‡§§"}

    def _load_hindi_common_words(self) -> set:
        return {"‡§π‡•à", "‡§π‡•à‡§Ç", "‡§•‡§æ", "‡§•‡•á", "‡§π‡•ã‡§ó‡§æ", "‡§π‡•ã‡§ó‡•Ä", "‡§π‡•ã‡§Ç‡§ó‡•á", "‡§π‡•Ç‡§Å", "‡§π‡•à‡§Ç", "‡§π‡•ã",
            "‡§Æ‡•à‡§Ç", "‡§§‡•Å‡§Æ", "‡§Ü‡§™", "‡§µ‡§π", "‡§π‡§Æ", "‡§µ‡•á", "‡§Ø‡§π", "‡§Ø‡•á", "‡§µ‡•ã", "‡§ï‡•ã‡§à",
            "‡§ï‡•Å‡§õ", "‡§∏‡§¨", "‡§∏‡§≠‡•Ä", "‡§π‡§∞", "‡§™‡•ç‡§∞‡§§‡•ç‡§Ø‡•á‡§ï", "‡§¶‡•ã‡§®‡•ã‡§Ç", "‡§ï‡•ã‡§à", "‡§ï‡•Å‡§õ",
            "‡§Ö‡§ö‡•ç‡§õ‡§æ", "‡§¨‡•Å‡§∞‡§æ", "‡§¨‡§°‡§º‡§æ", "‡§õ‡•ã‡§ü‡§æ", "‡§®‡§Ø‡§æ", "‡§™‡•Å‡§∞‡§æ‡§®‡§æ", "‡§Ö‡§ö‡•ç‡§õ‡•Ä", "‡§¨‡•Å‡§∞‡•Ä",
            "‡§Ü‡§ú", "‡§ï‡§≤", "‡§™‡§∞‡§∏‡•ã‡§Ç", "‡§∏‡•Å‡§¨‡§π", "‡§∂‡§æ‡§Æ", "‡§∞‡§æ‡§§", "‡§¶‡§ø‡§®", "‡§∏‡§Æ‡§Ø",
            "‡§™‡§æ‡§®‡•Ä", "‡§ñ‡§æ‡§®‡§æ", "‡§ò‡§∞", "‡§ó‡§æ‡§°‡§º‡•Ä", "‡§ï‡§ø‡§§‡§æ‡§¨", "‡§∏‡•ç‡§ï‡•Ç‡§≤", "‡§ï‡§æ‡§Æ", "‡§¶‡•ã‡§∏‡•ç‡§§",
            "‡§Æ‡§æ‡§Å", "‡§™‡§ø‡§§‡§æ", "‡§≠‡§æ‡§à", "‡§¨‡§π‡§®", "‡§¨‡•á‡§ü‡§æ", "‡§¨‡•á‡§ü‡•Ä", "‡§™‡§§‡•ç‡§®‡•Ä", "‡§™‡§§‡§ø",
            "‡§®‡§æ‡§Æ", "‡§â‡§Æ‡•ç‡§∞", "‡§´‡•ã‡§®", "‡§™‡§§‡§æ", "‡§∂‡§π‡§∞", "‡§¶‡•á‡§∂", "‡§≠‡§æ‡§∑‡§æ", "‡§π‡§ø‡§Ç‡§¶‡•Ä",
            "‡§¨‡•ã‡§≤‡§®‡§æ", "‡§∏‡•Å‡§®‡§®‡§æ", "‡§¶‡•á‡§ñ‡§®‡§æ", "‡§Ü‡§®‡§æ", "‡§ú‡§æ‡§®‡§æ", "‡§ñ‡§æ‡§®‡§æ", "‡§™‡•Ä‡§®‡§æ", "‡§∏‡•ã‡§®‡§æ",
            "‡§ö‡§≤‡§®‡§æ", "‡§¶‡•å‡§°‡§º‡§®‡§æ", "‡§¨‡•à‡§†‡§®‡§æ", "‡§ñ‡§°‡§º‡§æ", "‡§ñ‡•ã‡§≤‡§®‡§æ", "‡§¨‡§Ç‡§¶", "‡§™‡§¢‡§º‡§®‡§æ", "‡§≤‡§ø‡§ñ‡§®‡§æ",
            "‡§∏‡•Ä‡§ñ‡§®‡§æ", "‡§™‡§¢‡§º‡§æ‡§®‡§æ", "‡§Æ‡§¶‡§¶", "‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ", "‡§∏‡§Æ‡§æ‡§ß‡§æ‡§®", "‡§∏‡§µ‡§æ‡§≤", "‡§ú‡§µ‡§æ‡§¨",
            "‡§∏‡§π‡•Ä", "‡§ó‡§≤‡§§", "‡§∏‡§ö", "‡§ù‡•Ç‡§†", "‡§∏‡•Å‡§Ç‡§¶‡§∞", "‡§¨‡§¶‡§∏‡•Ç‡§∞‡§§", "‡§ñ‡•Å‡§∂", "‡§â‡§¶‡§æ‡§∏",
            "‡§ó‡•Å‡§∏‡•ç‡§∏‡§æ", "‡§•‡§ï‡§æ", "‡§¨‡•Ä‡§Æ‡§æ‡§∞", "‡§∏‡•ç‡§µ‡§∏‡•ç‡§•", "‡§ó‡§∞‡•ç‡§Æ", "‡§†‡§Ç‡§°‡§æ", "‡§Æ‡•Ä‡§†‡§æ", "‡§ñ‡§ü‡•ç‡§ü‡§æ",
            "‡§§‡•Ä‡§ñ‡§æ", "‡§®‡§Æ‡§ï‡•Ä‡§®", "‡§ï‡§°‡§º‡§µ‡§æ", "‡§§‡•á‡§ú", "‡§ß‡•Ä‡§Æ‡§æ", "‡§Ü‡§∏‡§æ‡§®", "‡§Æ‡•Å‡§∂‡•ç‡§ï‡§ø‡§≤",
            "‡§Æ‡§π‡§§‡•ç‡§µ‡§™‡•Ç‡§∞‡•ç‡§£", "‡§ú‡§∞‡•Ç‡§∞‡•Ä", "‡§∏‡§Ç‡§≠‡§µ", "‡§Ö‡§∏‡§Ç‡§≠‡§µ", "‡§Æ‡•Å‡§´‡•ç‡§§", "‡§Æ‡§π‡§Ç‡§ó‡§æ", "‡§∏‡§∏‡•ç‡§§‡§æ",
            "‡§Ö‡§Æ‡•Ä‡§∞", "‡§ó‡§∞‡•Ä‡§¨", "‡§ú‡§µ‡§æ‡§®", "‡§¨‡•Ç‡§¢‡§º‡§æ", "‡§™‡§π‡§≤‡§æ", "‡§Ü‡§ñ‡§ø‡§∞‡•Ä", "‡§Ö‡§ó‡§≤‡§æ", "‡§™‡§ø‡§õ‡§≤‡§æ",
            "‡§Ø‡§π‡§æ‡§Å", "‡§µ‡§π‡§æ‡§Å", "‡§ï‡§π‡§æ‡§Å", "‡§ï‡§¨", "‡§ï‡•ç‡§Ø‡•ã‡§Ç", "‡§ï‡•à‡§∏‡•á", "‡§ï‡•ç‡§Ø‡§æ", "‡§ï‡•å‡§®",
            "‡§Ø‡§π", "‡§µ‡§π", "‡§Ø‡•á", "‡§µ‡•á", "‡§Æ‡•à‡§Ç", "‡§§‡•Å‡§Æ", "‡§µ‡§π", "‡§π‡§Æ", "‡§µ‡•á", "‡§Æ‡•Å‡§ù‡•á",
            "‡§â‡§∏‡•á", "‡§π‡§Æ‡•á‡§Ç", "‡§â‡§®‡•ç‡§π‡•á‡§Ç", "‡§Æ‡•á‡§∞‡§æ", "‡§§‡•Å‡§Æ‡•ç‡§π‡§æ‡§∞‡§æ", "‡§â‡§∏‡§ï‡§æ", "‡§â‡§∏‡§ï‡•Ä", "‡§π‡§Æ‡§æ‡§∞‡§æ",
            "‡§â‡§®‡§ï‡§æ", "‡§î‡§∞", "‡§Ø‡§æ", "‡§≤‡•á‡§ï‡§ø‡§®", "‡§§‡•ã", "‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø", "‡§Ö‡§ó‡§∞", "‡§µ‡§∞‡§®‡§æ", "‡§≠‡•Ä",
            "‡§¨‡§π‡•Å‡§§", "‡§ï‡§æ‡§´‡•Ä", "‡§¨‡§≤‡•ç‡§ï‡§ø", "‡§¨‡§∏", "‡§ï‡•á‡§µ‡§≤", "‡§Ö‡§≠‡•Ä", "‡§π‡§Æ‡•á‡§∂‡§æ", "‡§ï‡§≠‡•Ä",
            "‡§Ö‡§ï‡•ç‡§∏‡§∞", "‡§Ü‡§Æ‡§§‡•å‡§∞", "‡§≤‡§ó‡§≠‡§ó", "‡§ï‡•á", "‡§¨‡§æ‡§∞‡•á", "‡§Æ‡•á‡§Ç", "‡§∏‡•á", "‡§ï‡•ã", "‡§™‡§∞",
            "‡§¶‡•ç‡§µ‡§æ‡§∞‡§æ", "‡§ï‡§æ", "‡§ï‡•Ä", "‡§ï‡•á", "‡§è‡§ï", "‡§ï‡•Å‡§õ", "‡§∏‡§≠‡•Ä", "‡§π‡§∞", "‡§™‡•ç‡§∞‡§§‡•ç‡§Ø‡•á‡§ï",

            # Indian States and Union Territories
            "‡§¨‡§ø‡§π‡§æ‡§∞", "‡§â‡§§‡•ç‡§§‡§∞‡§™‡•ç‡§∞‡§¶‡•á‡§∂", "‡§Æ‡§π‡§æ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞", "‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ‡§¨‡§Ç‡§ó‡§æ‡§≤", "‡§Æ‡§ß‡•ç‡§Ø‡§™‡•ç‡§∞‡§¶‡•á‡§∂",
            "‡§§‡§Æ‡§ø‡§≤‡§®‡§æ‡§°‡•Å", "‡§∞‡§æ‡§ú‡§∏‡•ç‡§•‡§æ‡§®", "‡§ï‡§∞‡•ç‡§®‡§æ‡§ü‡§ï", "‡§ó‡•Å‡§ú‡§∞‡§æ‡§§", "‡§Ü‡§Ç‡§ß‡•ç‡§∞‡§™‡•ç‡§∞‡§¶‡•á‡§∂",
            "‡§ì‡§°‡§ø‡§∂‡§æ", "‡§§‡•á‡§≤‡§Ç‡§ó‡§æ‡§®‡§æ", "‡§ï‡•á‡§∞‡§≤", "‡§ù‡§æ‡§∞‡§ñ‡§Ç‡§°", "‡§Ö‡§∏‡§Æ", "‡§™‡§Ç‡§ú‡§æ‡§¨", "‡§π‡§∞‡§ø‡§Ø‡§æ‡§£‡§æ",
            "‡§õ‡§§‡•ç‡§§‡•Ä‡§∏‡§ó‡§¢‡§º", "‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä", "‡§ú‡§Æ‡•ç‡§Æ‡•Ç‡§ï‡§∂‡•ç‡§Æ‡•Ä‡§∞", "‡§â‡§§‡•ç‡§§‡§∞‡§æ‡§ñ‡§Ç‡§°", "‡§π‡§ø‡§Æ‡§æ‡§ö‡§≤‡§™‡•ç‡§∞‡§¶‡•á‡§∂",
            "‡§§‡•ç‡§∞‡§ø‡§™‡•Å‡§∞‡§æ", "‡§Æ‡•á‡§ò‡§æ‡§≤‡§Ø", "‡§Æ‡§£‡§ø‡§™‡•Å‡§∞", "‡§®‡§æ‡§ó‡§æ‡§≤‡•à‡§Ç‡§°", "‡§Ö‡§∞‡•Å‡§£‡§æ‡§ö‡§≤‡§™‡•ç‡§∞‡§¶‡•á‡§∂",
            "‡§Æ‡§ø‡§ú‡•ã‡§∞‡§Æ", "‡§∏‡§ø‡§ï‡•ç‡§ï‡§ø‡§Æ", "‡§ó‡•ã‡§µ‡§æ", "‡§≤‡§ï‡•ç‡§∑‡§¶‡•ç‡§µ‡•Ä‡§™", "‡§¶‡§Æ‡§®‡§¶‡•Ä‡§µ", "‡§¶‡§æ‡§¶‡§∞‡§®‡§ó‡§∞‡§π‡§µ‡•á‡§≤‡•Ä",
            "‡§ö‡§Ç‡§°‡•Ä‡§ó‡§¢‡§º", "‡§™‡•Å‡§°‡•Å‡§ö‡•á‡§∞‡•Ä", "‡§Ö‡§Ç‡§°‡§Æ‡§æ‡§®‡§®‡§ø‡§ï‡•ã‡§¨‡§æ‡§∞",

            # Major Cities
            "‡§Æ‡•Å‡§Ç‡§¨‡§à", "‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä", "‡§¨‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶", "‡§Ö‡§π‡§Æ‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§ö‡•á‡§®‡•ç‡§®‡§à",
            "‡§ï‡•ã‡§≤‡§ï‡§æ‡§§‡§æ", "‡§∏‡•Ç‡§∞‡§§", "‡§™‡•Å‡§£‡•á", "‡§ú‡§Ø‡§™‡•Å‡§∞", "‡§≤‡§ñ‡§®‡§ä", "‡§ï‡§æ‡§®‡§™‡•Å‡§∞", "‡§®‡§æ‡§ó‡§™‡•Å‡§∞",
            "‡§á‡§Ç‡§¶‡•å‡§∞", "‡§≠‡•ã‡§™‡§æ‡§≤", "‡§µ‡§ø‡§∂‡§æ‡§ñ‡§æ‡§™‡§§‡•ç‡§§‡§®‡§Æ", "‡§™‡§ü‡§®‡§æ", "‡§µ‡§°‡•ã‡§¶‡§∞‡§æ", "‡§ó‡§æ‡§ú‡§ø‡§Ø‡§æ‡§¨‡§æ‡§¶",
            "‡§≤‡•Å‡§ß‡§ø‡§Ø‡§æ‡§®‡§æ", "‡§Ü‡§ó‡§∞‡§æ", "‡§®‡§æ‡§∂‡§ø‡§ï", "‡§´‡§∞‡•Ä‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§Æ‡•á‡§∞‡§†", "‡§∞‡§æ‡§ú‡§ï‡•ã‡§ü",
            "‡§ï‡§≤‡§ï‡§§‡•ç‡§§‡§æ", "‡§µ‡§æ‡§∞‡§æ‡§£‡§∏‡•Ä", "‡§∏‡•ã‡§≤‡§æ‡§™‡•Å‡§∞", "‡§µ‡§ø‡§ú‡§Ø‡§µ‡§æ‡§°‡§º‡§æ", "‡§ï‡•ã‡§≤‡•ç‡§π‡§æ‡§™‡•Å‡§∞", "‡§Ö‡§Æ‡•É‡§§‡§∏‡§∞",
            "‡§®‡§µ‡•Ä‡§Æ‡•Å‡§Ç‡§¨‡§à", "‡§Ö‡§≤‡•ç‡§≤‡§æ‡§π‡§æ‡§¨‡§æ‡§¶", "‡§∞‡§æ‡§Ç‡§ö‡•Ä", "‡§π‡§æ‡§µ‡§°‡§º‡§æ", "‡§ó‡•ç‡§µ‡§æ‡§≤‡§ø‡§Ø‡§∞", "‡§ú‡§¨‡§≤‡§™‡•Å‡§∞",
            "‡§ï‡•ã‡§Ø‡§Ç‡§¨‡§§‡•Ç‡§∞", "‡§Æ‡§¶‡•Å‡§∞‡•à", "‡§¨‡•á‡§≤‡§ó‡§æ‡§Æ", "‡§Æ‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§§‡§ø‡§∞‡•Å‡§ö‡§ø‡§∞‡§æ‡§™‡§≤‡•ç‡§≤‡•Ä", "‡§ï‡•ã‡§ö‡•ç‡§ö‡§ø",
            "‡§≠‡•Å‡§µ‡§®‡•á‡§∂‡•ç‡§µ‡§∞", "‡§¨‡•Ä‡§ï‡§æ‡§®‡•á‡§∞", "‡§Ö‡§Æ‡§∞‡§æ‡§µ‡§§‡•Ä", "‡§®‡•ã‡§è‡§°‡§æ", "‡§ï‡•ã‡§≤‡•ç‡§π‡§æ‡§™‡•Å‡§∞", "‡§Ö‡§ï‡•ã‡§≤‡§æ",
            "‡§ó‡•Å‡§≤‡§¨‡§∞‡•ç‡§ó‡§æ", "‡§∂‡§ø‡§Æ‡•ã‡§ó‡§æ", "‡§â‡§ú‡•ç‡§ú‡•à‡§®", "‡§≤‡•ã‡§®‡•Ä", "‡§ú‡§æ‡§≤‡§Ç‡§ß‡§∞", "‡§∏‡§æ‡§Ç‡§ó‡§≤‡•Ä",
            "‡§á‡§∞‡•ã‡§°", "‡§Æ‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§¨‡•á‡§≤‡§ó‡§æ‡§Æ", "‡§Ö‡§Ç‡§¨‡§æ‡§≤‡§æ", "‡§Æ‡•Å‡§∞‡§æ‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§Æ‡•à‡§∏‡•Ç‡§∞",
            "‡§§‡§ø‡§∞‡•Å‡§™‡§§‡§ø", "‡§ï‡§æ‡§≤‡•Ä‡§ï‡§ü", "‡§â‡§¶‡§Ø‡§™‡•Å‡§∞", "‡§Æ‡•á‡§∞‡§†", "‡§§‡§ø‡§∞‡•Å‡§µ‡§®‡§Ç‡§§‡§™‡•Å‡§∞‡§Æ", "‡§∏‡§≤‡•á‡§Æ",
            "‡§Æ‡§ø‡§∞‡•ç‡§ú‡§æ‡§™‡•Å‡§∞", "‡§∞‡§æ‡§ú‡§™‡•Å‡§∞‡§∏‡•ã‡§®‡§æ‡§∞‡§™‡•Å‡§∞", "‡§Ö‡§≤‡•Ä‡§ó‡§¢‡§º", "‡§¨‡§∞‡•á‡§≤‡•Ä", "‡§™‡§æ‡§∞‡§∏‡•Ä", "‡§∏‡•ã‡§≤‡§æ‡§™‡•Å‡§∞",
            "‡§Æ‡•Å‡§ú‡§´‡•ç‡§´‡§∞‡§®‡§ó‡§∞", "‡§¨‡§ø‡§π‡§æ‡§∞‡§∂‡§∞‡•Ä‡§´", "‡§Ö‡§π‡§Æ‡§¶‡§®‡§ó‡§∞", "‡§Æ‡§•‡•Å‡§∞‡§æ", "‡§ï‡•ã‡§ü‡§æ", "‡§Ö‡§ú‡§Æ‡•á‡§∞",
            "‡§ï‡§æ‡§≤‡•Ä‡§ï‡§ü", "‡§Ö‡§ï‡•ã‡§≤‡§æ", "‡§ó‡•Å‡§≤‡§¨‡§∞‡•ç‡§ó‡§æ", "‡§ú‡§Æ‡§∂‡•á‡§¶‡§™‡•Å‡§∞", "‡§¨‡•ã‡§ï‡§æ‡§∞‡•ã", "‡§Ö‡§≤‡§µ‡§∞",
            "‡§ó‡•Å‡§Ç‡§ü‡•Ç‡§∞", "‡§≠‡§ø‡§µ‡§Ç‡§°‡•Ä", "‡§∏‡§π‡§æ‡§∞‡§®‡§™‡•Å‡§∞", "‡§ó‡•ã‡§∞‡§ñ‡§™‡•Å‡§∞", "‡§¨‡•Ä‡§ï‡§æ‡§®‡•á‡§∞", "‡§Ö‡§Æ‡§∞‡§æ‡§µ‡§§‡•Ä",
            "‡§®‡•ã‡§è‡§°‡§æ", "‡§ï‡•ã‡§≤‡•ç‡§π‡§æ‡§™‡•Å‡§∞", "‡§Ö‡§ï‡•ã‡§≤‡§æ", "‡§ó‡•Å‡§≤‡§¨‡§∞‡•ç‡§ó‡§æ", "‡§∂‡§ø‡§Æ‡•ã‡§ó‡§æ", "‡§â‡§ú‡•ç‡§ú‡•à‡§®",
            "‡§≤‡•ã‡§®‡•Ä", "‡§ú‡§æ‡§≤‡§Ç‡§ß‡§∞", "‡§∏‡§æ‡§Ç‡§ó‡§≤‡•Ä", "‡§á‡§∞‡•ã‡§°", "‡§Æ‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§¨‡•á‡§≤‡§ó‡§æ‡§Æ", "‡§Ö‡§Ç‡§¨‡§æ‡§≤‡§æ",
            "‡§Æ‡•Å‡§∞‡§æ‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§Æ‡•à‡§∏‡•Ç‡§∞", "‡§§‡§ø‡§∞‡•Å‡§™‡§§‡§ø", "‡§ï‡§æ‡§≤‡•Ä‡§ï‡§ü", "‡§â‡§¶‡§Ø‡§™‡•Å‡§∞", "‡§Æ‡•á‡§∞‡§†",
            "‡§§‡§ø‡§∞‡•Å‡§µ‡§®‡§Ç‡§§‡§™‡•Å‡§∞‡§Æ", "‡§∏‡§≤‡•á‡§Æ", "‡§Æ‡§ø‡§∞‡•ç‡§ú‡§æ‡§™‡•Å‡§∞", "‡§∞‡§æ‡§ú‡§™‡•Å‡§∞‡§∏‡•ã‡§®‡§æ‡§∞‡§™‡•Å‡§∞", "‡§Ö‡§≤‡•Ä‡§ó‡§¢‡§º",
            "‡§¨‡§∞‡•á‡§≤‡•Ä", "‡§™‡§æ‡§∞‡§∏‡•Ä", "‡§∏‡•ã‡§≤‡§æ‡§™‡•Å‡§∞", "‡§Æ‡•Å‡§ú‡§´‡•ç‡§´‡§∞‡§®‡§ó‡§∞", "‡§¨‡§ø‡§π‡§æ‡§∞‡§∂‡§∞‡•Ä‡§´", "‡§Ö‡§π‡§Æ‡§¶‡§®‡§ó‡§∞",
            "‡§Æ‡§•‡•Å‡§∞‡§æ", "‡§ï‡•ã‡§ü‡§æ", "‡§Ö‡§ú‡§Æ‡•á‡§∞", "‡§ï‡§æ‡§≤‡•Ä‡§ï‡§ü", "‡§Ö‡§ï‡•ã‡§≤‡§æ", "‡§ó‡•Å‡§≤‡§¨‡§∞‡•ç‡§ó‡§æ", "‡§ú‡§Æ‡§∂‡•á‡§¶‡§™‡•Å‡§∞",
            "‡§¨‡•ã‡§ï‡§æ‡§∞‡•ã", "‡§Ö‡§≤‡§µ‡§∞", "‡§ó‡•Å‡§Ç‡§ü‡•Ç‡§∞", "‡§≠‡§ø‡§µ‡§Ç‡§°‡•Ä", "‡§∏‡§π‡§æ‡§∞‡§®‡§™‡•Å‡§∞", "‡§ó‡•ã‡§∞‡§ñ‡§™‡•Å‡§∞",

            # Common Hindi proper nouns and corrections
            "‡§≠‡§æ‡§∞‡§§", "‡§π‡§ø‡§Ç‡§¶‡•Å‡§∏‡•ç‡§§‡§æ‡§®", "‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ", "‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä", "‡§Æ‡•Å‡§Ç‡§¨‡§à", "‡§ï‡•ã‡§≤‡§ï‡§æ‡§§‡§æ",
            "‡§ö‡•á‡§®‡•ç‡§®‡§à", "‡§¨‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶", "‡§™‡•Å‡§£‡•á", "‡§Ö‡§π‡§Æ‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§ú‡§Ø‡§™‡•Å‡§∞",
            "‡§≤‡§ñ‡§®‡§ä", "‡§ï‡§æ‡§®‡§™‡•Å‡§∞", "‡§®‡§æ‡§ó‡§™‡•Å‡§∞", "‡§á‡§Ç‡§¶‡•å‡§∞", "‡§≠‡•ã‡§™‡§æ‡§≤", "‡§™‡§ü‡§®‡§æ", "‡§∞‡§æ‡§Ç‡§ö‡•Ä",
            "‡§≠‡•Å‡§µ‡§®‡•á‡§∂‡•ç‡§µ‡§∞", "‡§ó‡•Å‡§µ‡§æ‡§π‡§æ‡§ü‡•Ä", "‡§∂‡§ø‡§Æ‡§≤‡§æ", "‡§ö‡§Ç‡§°‡•Ä‡§ó‡§¢‡§º", "‡§¶‡•á‡§π‡§∞‡§æ‡§¶‡•Ç‡§®", "‡§ó‡§æ‡§Ç‡§ß‡•Ä‡§®‡§ó‡§∞",
            "‡§∞‡§æ‡§Ø‡§™‡•Å‡§∞", "‡§≠‡•Å‡§µ‡§®‡•á‡§∂‡•ç‡§µ‡§∞", "‡§ï‡•ã‡§π‡§ø‡§Æ‡§æ", "‡§á‡§Æ‡•ç‡§´‡§æ‡§≤", "‡§∂‡§ø‡§≤‡§æ‡§Ç‡§ó", "‡§Ü‡§á‡§ú‡•ã‡§≤",
            "‡§ó‡§Ç‡§ó‡§ü‡•ã‡§ï", "‡§™‡§£‡§ú‡•Ä", "‡§ï‡§µ‡§∞‡§§‡•ç‡§§‡•Ä", "‡§¶‡§Æ‡§®", "‡§∏‡§ø‡§≤‡§µ‡§æ‡§∏‡§æ", "‡§¶‡§æ‡§¶‡§∞‡§æ", "‡§™‡•Å‡§°‡•Å‡§ö‡•á‡§∞‡•Ä",
            "‡§™‡•ã‡§∞‡•ç‡§ü‡§¨‡•ç‡§≤‡•á‡§Ø‡§∞", "‡§ï‡§∞‡§µ‡§æ‡§∞", "‡§Æ‡§Ç‡§ó‡§≤‡•å‡§∞", "‡§ï‡•ã‡§ö‡•ç‡§ö‡§ø", "‡§§‡§ø‡§∞‡•Å‡§µ‡§®‡§Ç‡§§‡§™‡•Å‡§∞‡§Æ",
            "‡§Æ‡§¶‡•Å‡§∞‡•à", "‡§ï‡•ã‡§Ø‡§Ç‡§¨‡§§‡•Ç‡§∞", "‡§∏‡§≤‡•á‡§Æ", "‡§§‡§ø‡§∞‡•Å‡§ö‡§ø‡§∞‡§æ‡§™‡§≤‡•ç‡§≤‡•Ä", "‡§µ‡§ø‡§∂‡§æ‡§ñ‡§æ‡§™‡§§‡•ç‡§§‡§®‡§Æ",
            "‡§µ‡§ø‡§ú‡§Ø‡§µ‡§æ‡§°‡§º‡§æ", "‡§ó‡•Å‡§Ç‡§ü‡•Ç‡§∞", "‡§ï‡§æ‡§ï‡§ø‡§®‡§æ‡§°‡§æ", "‡§∞‡§æ‡§ú‡§Æ‡•Å‡§Ç‡§¶‡§∞‡•Ä", "‡§®‡•á‡§≤‡•ç‡§≤‡•ã‡§∞", "‡§ï‡§°‡§™‡§æ",
            "‡§Ö‡§®‡§Ç‡§§‡§™‡•Å‡§∞", "‡§§‡§ø‡§∞‡•Å‡§™‡§§‡§ø", "‡§ö‡§ø‡§§‡•ç‡§§‡•Ç‡§∞", "‡§ï‡•Å‡§∞‡•ç‡§®‡•Ç‡§≤", "‡§®‡§æ‡§Ç‡§¶‡•á‡§°", "‡§≤‡§æ‡§§‡•Ç‡§∞",
            "‡§™‡§∞‡§≠‡§£‡•Ä", "‡§®‡§æ‡§Ç‡§¶‡•Å‡§∞‡§¨‡§æ‡§∞", "‡§ú‡§≤‡§ó‡§æ‡§Å‡§µ", "‡§Ö‡§π‡§Æ‡§¶‡§®‡§ó‡§∞", "‡§®‡§æ‡§∏‡§ø‡§ï", "‡§ß‡•Å‡§≤‡•á",
            "‡§ú‡§≤‡§ó‡§æ‡§Å‡§µ", "‡§®‡§æ‡§Ç‡§¶‡•Å‡§∞‡§¨‡§æ‡§∞", "‡§™‡§∞‡§≠‡§£‡•Ä", "‡§≤‡§æ‡§§‡•Ç‡§∞", "‡§®‡§æ‡§Ç‡§¶‡•á‡§°", "‡§ï‡•Å‡§∞‡•ç‡§®‡•Ç‡§≤",
            "‡§ö‡§ø‡§§‡•ç‡§§‡•Ç‡§∞", "‡§§‡§ø‡§∞‡•Å‡§™‡§§‡§ø", "‡§Ö‡§®‡§Ç‡§§‡§™‡•Å‡§∞", "‡§ï‡§°‡§™‡§æ", "‡§®‡•á‡§≤‡•ç‡§≤‡•ã‡§∞", "‡§∞‡§æ‡§ú‡§Æ‡•Å‡§Ç‡§¶‡§∞‡•Ä",
            "‡§ï‡§æ‡§ï‡§ø‡§®‡§æ‡§°‡§æ", "‡§ó‡•Å‡§Ç‡§ü‡•Ç‡§∞", "‡§µ‡§ø‡§ú‡§Ø‡§µ‡§æ‡§°‡§º‡§æ", "‡§µ‡§ø‡§∂‡§æ‡§ñ‡§æ‡§™‡§§‡•ç‡§§‡§®‡§Æ", "‡§§‡§ø‡§∞‡•Å‡§ö‡§ø‡§∞‡§æ‡§™‡§≤‡•ç‡§≤‡•Ä",
            "‡§∏‡§≤‡•á‡§Æ", "‡§ï‡•ã‡§Ø‡§Ç‡§¨‡§§‡•Ç‡§∞", "‡§Æ‡§¶‡•Å‡§∞‡•à", "‡§§‡§ø‡§∞‡•Å‡§µ‡§®‡§Ç‡§§‡§™‡•Å‡§∞‡§Æ", "‡§ï‡•ã‡§ö‡•ç‡§ö‡§ø", "‡§Æ‡§Ç‡§ó‡§≤‡•å‡§∞",
            "‡§ï‡§∞‡§µ‡§æ‡§∞", "‡§™‡•ã‡§∞‡•ç‡§ü‡§¨‡•ç‡§≤‡•á‡§Ø‡§∞", "‡§™‡•Å‡§°‡•Å‡§ö‡•á‡§∞‡•Ä", "‡§¶‡§æ‡§¶‡§∞‡§æ", "‡§∏‡§ø‡§≤‡§µ‡§æ‡§∏‡§æ", "‡§¶‡§Æ‡§®",
            "‡§ï‡§µ‡§∞‡§§‡•ç‡§§‡•Ä", "‡§™‡§£‡§ú‡•Ä", "‡§ó‡§Ç‡§ó‡§ü‡•ã‡§ï", "‡§Ü‡§á‡§ú‡•ã‡§≤", "‡§∂‡§ø‡§≤‡§æ‡§Ç‡§ó", "‡§á‡§Æ‡•ç‡§´‡§æ‡§≤", "‡§ï‡•ã‡§π‡§ø‡§Æ‡§æ",
            "‡§≠‡•Å‡§µ‡§®‡•á‡§∂‡•ç‡§µ‡§∞", "‡§∞‡§æ‡§Ø‡§™‡•Å‡§∞", "‡§ó‡§æ‡§Ç‡§ß‡•Ä‡§®‡§ó‡§∞", "‡§¶‡•á‡§π‡§∞‡§æ‡§¶‡•Ç‡§®", "‡§ö‡§Ç‡§°‡•Ä‡§ó‡§¢‡§º", "‡§∂‡§ø‡§Æ‡§≤‡§æ",
            "‡§ó‡•Å‡§µ‡§æ‡§π‡§æ‡§ü‡•Ä", "‡§≠‡•Å‡§µ‡§®‡•á‡§∂‡•ç‡§µ‡§∞", "‡§∞‡§æ‡§Ç‡§ö‡•Ä", "‡§™‡§ü‡§®‡§æ", "‡§≠‡•ã‡§™‡§æ‡§≤", "‡§á‡§Ç‡§¶‡•å‡§∞", "‡§®‡§æ‡§ó‡§™‡•Å‡§∞",
            "‡§ï‡§æ‡§®‡§™‡•Å‡§∞", "‡§≤‡§ñ‡§®‡§ä", "‡§ú‡§Ø‡§™‡•Å‡§∞", "‡§Ö‡§π‡§Æ‡§¶‡§æ‡§¨‡§æ‡§¶", "‡§™‡•Å‡§£‡•á", "‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶", "‡§¨‡§Ç‡§ó‡§≤‡•å‡§∞",
            "‡§ö‡•á‡§®‡•ç‡§®‡§à", "‡§ï‡•ã‡§≤‡§ï‡§æ‡§§‡§æ", "‡§Æ‡•Å‡§Ç‡§¨‡§à", "‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä", "‡§π‡§ø‡§Ç‡§¶‡•Å‡§∏‡•ç‡§§‡§æ‡§®", "‡§≠‡§æ‡§∞‡§§"}

    def is_hindi_word(self, word: str) -> bool:
        """Return True when `word` has at least one character in the range U+0900..U+097F."""
        return any('\u0900' <= symbol <= '\u097F' for symbol in word)

    def clean_word(self, word: str) -> str:
        """Drop every character outside U+0900..U+097F (spaces are kept), NFC-normalize, and trim."""
        devanagari_only = re.sub(r'[^\u0900-\u097F\u0020]', '', word)
        normalized = unicodedata.normalize('NFC', devanagari_only)
        return normalized.strip()

    def check_word(self, word: str) -> Tuple[bool, str]:
        if not word or not self.is_hindi_word(word):
            return True, word
        cleaned = self.clean_word(word)
        if not cleaned:
            return True, word
        if cleaned in self.common_corrections:
            return False, self.common_corrections[cleaned]
        if cleaned in self.hindi_common_words:
            return True, cleaned
        return True, cleaned

    def correct_text(self, text: str) -> str:
        words = text.split()
        out = []
        for w in words:
            _, cw = self.check_word(w)
            out.append(cw)
        return ' '.join(out)

# Shared checker instance; later cells call spell_checker.correct_text(...) on transcribed text.
spell_checker = HindiSpellChecker()


In [None]:
# XLM-RoBERTa model loader and inference
class HindiNERModel:
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.id2label = None
        self.label2id = None

    def load(self):
        """Load the XLM-RoBERTa model and tokenizer"""
        if self.model is None:
            print(f"üì• Loading XLM-RoBERTa model from {self.model_path}...")
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
                self.model = AutoModelForTokenClassification.from_pretrained(self.model_path)
                self.model.to(self.device)
                self.model.eval()

                # Get label mappings from model config
                if hasattr(self.model.config, 'id2label'):
                    self.id2label = self.model.config.id2label
                    self.label2id = self.model.config.label2id
                else:
                    # Fallback to default labels if not in config
                    self.id2label = {str(i): f"LABEL_{i}" for i in range(23)}

                print(f"‚úÖ Model loaded successfully on {self.device}")
            except Exception as e:
                print(f"‚ùå Error loading model: {e}")
                raise
        return self.model

    def align_predictions(self, words: List[str], tokenizer, token_ids: List[int],
                         predictions: List[int], label_map: dict, text: str) -> Tuple[List[str], List[dict]]:
        """
        Align token-level predictions back to word-level labels and entities.

        Args:
            words: The whitespace-separated words of `text`.
            tokenizer: Callable tokenizer. It is re-run on `text` with
                `return_offsets_mapping=True` to recover the token/word alignment.
            token_ids: Not used by this method; kept so existing callers keep working.
            predictions: One predicted label id per token, indexed like the
                tokens of the encoding (index 0 is treated as a special token
                in the offset fallback).
            label_map: Mapping from label id to label string. Keys may be
                strings or ints; unknown ids map to "O".
            text: The original text the words came from.

        Returns:
            `(word_labels, entities)` where `word_labels` holds one BIO label
            per word and each entity is a dict with `text`, `label`, `start`
            and `end` (inclusive word indices).
        """
        def resolve_label(pred):
            # A transformers config keys id2label by int while JSON-loaded maps
            # use str keys; try the str key first (original behavior), then the
            # raw id, so int-keyed maps no longer collapse every label to "O".
            label = label_map.get(str(pred))
            if label is None:
                label = label_map.get(pred, "O")
            # The "Out of Vocabulary" label is treated as the outside tag
            return "O" if label == "Out of Vocabulary" else label

        token_labels = [resolve_label(pred) for pred in predictions]

        # Get tokenization with offset mapping.
        # Try without truncation first; retry with truncation if that call fails.
        try:
            encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=True,
                                padding=False, truncation=False)
        except Exception:
            encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=True,
                                padding=False, truncation=True, max_length=512)

        offsets = encoding['offset_mapping']

        # Prefer word_ids() when the encoding provides it. ValueError is caught
        # as well because encodings from non-fast tokenizers raise it here, and
        # that case should reach the offset-based fallback instead of crashing.
        word_ids = None
        try:
            if hasattr(encoding, 'word_ids'):
                word_ids = encoding.word_ids()
        except (AttributeError, TypeError, ValueError):
            pass

        word_labels = ["O"] * len(words)

        if word_ids is not None:
            for token_idx, word_id in enumerate(word_ids):
                if word_id is None or token_idx >= len(token_labels):
                    continue
                # The tokenizer's own word split may disagree with `words`;
                # ignore ids that do not index into `words`.
                if not 0 <= word_id < len(words):
                    continue
                token_label = token_labels[token_idx]
                if token_label == "O":
                    continue
                # First non-O label wins, but a B- tag overrides an earlier one
                if word_labels[word_id] == "O" or token_label.startswith('B-'):
                    word_labels[word_id] = token_label
        else:
            # Fallback: locate each word's start offset in `text`, skipping spaces
            word_starts = []
            char_pos = 0
            for word in words:
                while char_pos < len(text) and text[char_pos] == ' ':
                    char_pos += 1
                word_starts.append(char_pos)
                char_pos += len(word)
                while char_pos < len(text) and text[char_pos] == ' ':
                    char_pos += 1

            for token_idx, (start, end) in enumerate(offsets):
                # Skip the leading special token, missing offsets, and tokens
                # that have no prediction
                if token_idx == 0 or start is None or end is None or token_idx >= len(token_labels):
                    continue

                token_label = token_labels[token_idx]
                if token_label == "O":
                    continue

                # First word whose character span overlaps the token's span
                word_idx = None
                for i, word_start in enumerate(word_starts):
                    word_end = word_start + len(words[i])
                    if start < word_end and end > word_start:
                        word_idx = i
                        break

                if word_idx is None:
                    continue

                # First non-O label wins, but a B- tag overrides an earlier one
                if word_labels[word_idx] == "O" or token_label.startswith('B-'):
                    word_labels[word_idx] = token_label

        # Post-process: an I- tag that does not continue an entity of the same
        # type (previous word is O, unprefixed, or a different type) becomes B-
        for i in range(len(word_labels)):
            if not word_labels[i].startswith('I-'):
                continue
            entity_type = word_labels[i].split('-', 1)[1]
            prev_label = word_labels[i - 1] if i > 0 else "O"
            if prev_label.startswith(('B-', 'I-')):
                prev_entity_type = prev_label.split('-', 1)[1]
            else:
                prev_entity_type = None
            if prev_entity_type != entity_type:
                word_labels[i] = 'B-' + entity_type

        # Group consecutive B-/I- words of the same type into entities
        entities = []
        current = None
        for i, (w, label) in enumerate(zip(words, word_labels)):
            if label.startswith('B-'):
                if current:
                    entities.append(current)
                current = {'text': w, 'label': label[2:], 'start': i, 'end': i}
            elif label.startswith('I-') and current and label[2:] == current['label']:
                current['text'] += ' ' + w
                current['end'] = i
            else:
                if current:
                    entities.append(current)
                current = None

        if current:
            entities.append(current)

        return word_labels, entities

    def predict(self, text: str):
        """Predict NER tags for input text"""
        self.load()
        words = text.strip().split()
        if not words:
            return [], []

        # Tokenize on words to enable reliable word alignment
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=512,
        )

        inputs = {k: v.to(self.device) for k, v in encoding.items() if hasattr(v, 'to')}

        with torch.no_grad():
            outputs = self.model(**inputs)
            token_preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

        # Build a robust id->label mapper (handle dict or list)
        def id_to_label_fn(idx: int) -> str:
            if self.id2label is None:
                return f"LABEL_{idx}"
            if isinstance(self.id2label, dict):
                lbl = self.id2label.get(str(idx)) or self.id2label.get(idx)
                return str(lbl) if lbl is not None else f"LABEL_{idx}"
            if isinstance(self.id2label, (list, tuple)) and 0 <= idx < len(self.id2label):
                return str(self.id2label[idx])
            return f"LABEL_{idx}"

        # Initialize word-level labels
        word_labels = ["O"] * len(words)

        # Prefer tokenizer's word_ids mapping
        word_ids = None
        try:
            if hasattr(encoding, 'word_ids'):
                word_ids = encoding.word_ids()
        except Exception:
            word_ids = None

        if word_ids is not None:
            for tok_idx, word_id in enumerate(word_ids):
                if word_id is None or tok_idx >= len(token_preds):
                    continue
                label_str = id_to_label_fn(token_preds[tok_idx])
                if label_str == "Out of Vocabulary":
                    label_str = "O"
                if label_str != "O":
                    if word_labels[word_id] == "O" or label_str.startswith("B-"):
                        word_labels[word_id] = label_str
        else:
            # Fallback: naive assignment by distributing non-O labels to nearest words
            wi = 0
            for tok_idx, _ in enumerate(token_preds):
                if wi >= len(words):
                    break
                label_str = id_to_label_fn(token_preds[tok_idx])
                if label_str == "Out of Vocabulary":
                    label_str = "O"
                if label_str != "O":
                    if word_labels[wi] == "O" or label_str.startswith("B-"):
                        word_labels[wi] = label_str
                # Advance conservatively
                if label_str != "O" or (tok_idx % 2 == 1):
                    wi = min(wi + 1, len(words) - 1)

        # Post-process I- tags to valid BIO
        for i in range(len(word_labels)):
            if word_labels[i].startswith('I-'):
                prev = word_labels[i-1] if i > 0 else 'O'
                ent = word_labels[i].split('-', 1)[1]
                if prev == 'O' or not prev.endswith(ent):
                    word_labels[i] = 'B-' + ent

        # Build entities from word labels
        entities = []
        current = None
        for i, (w, label) in enumerate(zip(words, word_labels)):
            if label.startswith('B-'):
                if current:
                    entities.append(current)
                current = {'text': w, 'label': label[2:], 'start': i, 'end': i}
            elif label.startswith('I-') and current and label[2:] == current['label']:
                current['text'] += ' ' + w
                current['end'] = i
            else:
                if current:
                    entities.append(current)
                current = None
        if current:
            entities.append(current)

        return word_labels, entities

# Shared NER wrapper; the constructor only stores the path, weights load on the first load()/predict() call.
ner_model = HindiNERModel("/content/drive/MyDrive/XLM_ROBERTA")


In [None]:
# Audio transcription helper
class HindiAudioTranscriber:
    """Transcribe Hindi audio with faster-whisper, one silence-delimited chunk at a time."""

    def __init__(self, model_size: str = "medium", enable_spell_check: bool = True):
        """Store the settings; the Whisper model itself is created by `load_model()`."""
        self.model_size = model_size
        self.enable_spell_check = enable_spell_check
        self.model = None

    def load_model(self):
        """Create the WhisperModel on first use; later calls do nothing."""
        if self.model is None:
            self.model = WhisperModel(self.model_size, device="auto", compute_type="auto")

    def split_audio(self, file_path: Path, min_silence_len=500, silence_thresh=-40, keep_silence=300):
        """Load `file_path` and split it on silence.

        `silence_thresh` is added to the clip's own `dBFS` value, so it acts as
        an offset relative to the clip rather than an absolute level. The
        remaining arguments are forwarded to `split_on_silence` unchanged.

        Returns:
            The list of chunks produced by `split_on_silence`.
        """
        audio = AudioSegment.from_file(file_path)
        chunks = split_on_silence(
            audio,
            min_silence_len=min_silence_len,
            silence_thresh=audio.dBFS + silence_thresh,
            keep_silence=keep_silence
        )
        return chunks

    def transcribe_chunk(self, chunk) -> str:
        """Export `chunk` to a temporary WAV file, transcribe it as Hindi, and return the text.

        The temporary file is removed even when exporting or transcribing
        raises; the exception itself still propagates to the caller.
        """
        import os
        import tempfile

        # Reserve a path, then close our handle so the exporter and the model
        # can open the file by name without a competing open handle.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()
        try:
            chunk.export(tmp.name, format="wav")
            segments, _info = self.model.transcribe(tmp.name, language="hi")
            # Join inside the try so the segments are consumed while the file still exists
            return " ".join(seg.text for seg in segments)
        finally:
            os.unlink(tmp.name)

    def transcribe(self, file_path: Path) -> str:
        """Transcribe a whole audio file.

        Returns:
            The chunk transcriptions joined with spaces and stripped, passed
            through `spell_checker.correct_text` when spell checking is
            enabled and the text is non-empty; "" when no chunks are found.
        """
        self.load_model()
        chunks = self.split_audio(file_path)
        if not chunks:
            return ""
        full_text = " ".join(self.transcribe_chunk(ch) for ch in chunks).strip()
        if self.enable_spell_check and full_text:
            return spell_checker.correct_text(full_text)
        return full_text

# Shared transcriber with the default settings ("medium" model, spell check enabled).
transcriber = HindiAudioTranscriber()


In [None]:
# Audio input UI: Upload or specify a path, then transcribe and run NER
# Priority: Uploaded audio > Provided path > existing final_transcription.txt

import io
import os
from IPython.display import clear_output
import ipywidgets as widgets

# Transcription file, relative to the current working directory; read as a
# fallback input and overwritten after each successful transcription.
transcription_txt = Path("final_transcription.txt")

# Widgets
# Single-file upload restricted to the listed audio extensions
uploader = widgets.FileUpload(accept='.wav,.mp3,.m4a,.flac,.ogg', multiple=False)
# Alternative input: a path to an audio file that already exists on disk
path_box = widgets.Text(placeholder='path/to/audio.wav or .mp3 ...', description='Audio path:')
run_btn = widgets.Button(description='Transcribe audio', button_style='primary', icon='play')
# Output area that receives everything handle_transcription prints
status_out = widgets.Output()

# Latest transcribed/loaded text; handle_transcription assigns it via `global text`
text = ""

def handle_transcription(_):
    """Button callback: choose an input source, obtain its text, and store it in the global `text`.

    Source priority: an uploaded audio file, then the typed path (only if it
    exists), then an existing final_transcription.txt. Audio is transcribed,
    spell-checked, and written back to final_transcription.txt. All progress
    and error messages are printed into `status_out`.

    Args:
        _: The clicked button passed in by `on_click`; unused.
    """
    global text
    with status_out:
        clear_output(wait=True)
        chosen_audio_path = None

        # 1) Uploaded file
        if uploader.value:
            upload = uploader.value
            if isinstance(upload, dict):
                # ipywidgets 7: {filename: {'metadata': ..., 'content': bytes}}
                [(fname, fileinfo)] = list(upload.items())
            else:
                # ipywidgets 8: a tuple of dicts, each with 'name' and 'content'
                [fileinfo] = list(upload)
                fname = fileinfo['name']
            data = fileinfo['content']
            # Save locally with the original name
            chosen_audio_path = Path(fname)
            with open(chosen_audio_path, 'wb') as f:
                f.write(data)
            print(f"üìÅ Saved uploaded audio to: {chosen_audio_path}")

        # 2) Path typed by user
        elif path_box.value and Path(path_box.value).exists():
            chosen_audio_path = Path(path_box.value)
            print(f"üìù Using provided path: {chosen_audio_path}")

        # 3) Fallback to existing transcription
        if chosen_audio_path is None:
            if transcription_txt.exists():
                print("‚úÖ Found final_transcription.txt ‚Äî using it as input")
                text_local = transcription_txt.read_text(encoding="utf-8").strip()
                if not text_local:
                    print("‚ö†Ô∏è final_transcription.txt is empty.")
                else:
                    text = spell_checker.correct_text(text_local)
                    print("\n=== Transcribed / Input Text (Full) ===\n")
                    print(text)
                return
            else:
                print("‚ùå No audio provided and final_transcription.txt not found.")
                print("‚û°Ô∏è Upload an audio file above or enter a valid audio path.")
                return

        # If we have an audio path, transcribe it
        try:
            print(f"üéµ Transcribing audio: {chosen_audio_path}")
            text_local = transcriber.transcribe(chosen_audio_path)
            if not text_local:
                print("‚ö†Ô∏è Got empty transcription.")
                return
            # Optional spell check pass
            text = spell_checker.correct_text(text_local)
            # Save to final_transcription.txt for reproducibility
            transcription_txt.write_text(text, encoding="utf-8")
            print("üíæ Saved transcription to final_transcription.txt")
            print("\n=== Transcribed / Input Text (Full) ===\n")
            print(text)
        except Exception as e:
            # Report the failure in the output area instead of raising out of the callback
            print(f"‚ùå Transcription failed: {e}")

# Run handle_transcription whenever the button is clicked
run_btn.on_click(handle_transcription)

# Stack the heading, both inputs, the button, and the status area vertically
ui = widgets.VBox([
    widgets.HTML('<b>Provide audio input</b> (upload or enter a file path)'),
    uploader,
    path_box,
    run_btn,
    status_out
])

display(ui)


VBox(children=(HTML(value='<b>Provide audio input</b> (upload or enter a file path)'), FileUpload(value={}, ac‚Ä¶

In [None]:
# Helpers: chunk text and merge NER results for long inputs
from typing import Optional


def chunk_token_spans(tokens: list[str], chunk_size: int = 120, overlap: int = 20) -> list[tuple[int, int]]:
    """Split a token list into overlapping ``(start, end)`` index windows.

    Each span is half-open (``tokens[start:end]``), at most ``chunk_size``
    tokens long, and consecutive spans share ``overlap`` tokens. The final
    span always ends at ``len(tokens)``.

    Args:
        tokens: The token sequence to window; only its length is used.
        chunk_size: Maximum number of tokens per window (must be > 0).
        overlap: Number of tokens shared by consecutive windows
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The list of ``(start, end)`` spans, empty when ``tokens`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if overlap >= chunk_size:
        raise ValueError("overlap must be < chunk_size")
    if overlap < 0:
        # A negative overlap would make the stride larger than the window,
        # silently leaving gaps of tokens that no chunk ever covers.
        raise ValueError("overlap must be >= 0")
    n = len(tokens)
    if n == 0:
        return []
    spans = []
    step = chunk_size - overlap  # >= 1 thanks to the validation above
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        spans.append((start, end))
        if end == n:
            # Stop as soon as a window reaches the end so we do not emit
            # trailing windows that are pure subsets of the last one.
            break
        start += step
    return spans


def merge_adjacent_entities(entities: list[dict]) -> list[dict]:
    """Merge same-label entities whose word spans touch or overlap.

    Entities are dicts with ``text`` (space-joined words), ``label`` and
    inclusive word indices ``start``/``end``. After sorting by position,
    an entity is folded into the previous one when both carry the same label
    and it starts no later than one word after the previous one ends.

    When two spans overlap (for example the same entity reported twice by
    overlapping chunks) only the words that extend beyond the already-merged
    span are appended, so the merged text does not contain duplicated words.

    Args:
        entities: Entity dicts; the input list and its dicts are not mutated.

    Returns:
        A new list of merged entity dicts ordered by ``(start, end)``.
    """
    if not entities:
        return []
    ordered = sorted(entities, key=lambda e: (e['start'], e['end']))
    merged: list[dict] = []
    current: Optional[dict] = None
    for ent in ordered:
        if current is None:
            current = dict(ent)
            continue
        if ent['label'] == current['label'] and ent['start'] <= current['end'] + 1:
            # Number of leading words of `ent` already covered by `current`.
            shared = current['end'] - ent['start'] + 1
            if shared <= 0:
                # Purely adjacent: every word of `ent` is new.
                extra = ent['text']
            else:
                ent_words = ent['text'].split()
                if len(ent_words) == ent['end'] - ent['start'] + 1:
                    # One word per index: drop the words already present.
                    extra = " ".join(ent_words[shared:])
                elif ent['end'] <= current['end']:
                    # Fully contained span; nothing new to add.
                    extra = ""
                else:
                    # Text does not map 1:1 to indices, so we cannot trim
                    # safely; keep the whole text but separate it properly.
                    extra = ent['text']
            if extra:
                current['text'] = f"{current['text']} {extra}"
            current['end'] = max(current['end'], ent['end'])
        else:
            merged.append(current)
            current = dict(ent)
    if current is not None:
        merged.append(current)
    return merged


def run_ner_in_chunks(text: str, chunk_size: int = 120, overlap: int = 20):
    """Run the NER model over long text in overlapping word windows.

    The text is split on whitespace, windowed with ``chunk_token_spans`` and
    each window is passed to ``ner_model.predict``. Per-word labels are
    stitched together (the first chunk that labels a word wins) and the
    chunk-relative entity indices are shifted to absolute word positions.

    Returns:
        ``(labels, entities)``: one label per word (``'O'`` where no chunk
        supplied one) and the entities after ``merge_adjacent_entities``.
        Both are empty lists when the text contains no words.
    """
    tokens = text.strip().split()
    if not tokens:
        return [], []

    total = len(tokens)
    labels: list[Optional[str]] = [None] * total
    collected: list[dict] = []

    for begin, stop in chunk_token_spans(tokens, chunk_size=chunk_size, overlap=overlap):
        chunk_preds, chunk_entities = ner_model.predict(" ".join(tokens[begin:stop]))

        # Earlier chunks take precedence in the overlap region.
        for position, tag in enumerate(chunk_preds, start=begin):
            if position < total and labels[position] is None:
                labels[position] = tag

        # Shift chunk-relative indices to positions in the full word list.
        for found in chunk_entities:
            collected.append({
                'text': found['text'],
                'label': found['label'],
                'start': found['start'] + begin,
                'end': found['end'] + begin,
            })

    # Words no chunk produced a label for are treated as outside any entity.
    labels = ['O' if tag is None else tag for tag in labels]

    return labels, merge_adjacent_entities(collected)


In [None]:
# POS Tagging for OOV words using Stanza
from pathlib import Path

# Build the module-level Stanza pipeline (`nlp`) used for Hindi POS tagging.
# `nlp` stays None if anything below fails, and downstream code checks for
# that before using it.
nlp = None
try:
    # Fetch the Hindi ('hi') Stanza models; this call runs every time the
    # cell is executed.
    stanza.download('hi')
    # Try a GPU-backed pipeline first; on any error, report it and rebuild
    # the same tokenize+pos pipeline on CPU.
    try:
        nlp = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=True)
        print("‚úÖ Stanza Hindi POS tagging pipeline loaded (GPU)")
    except Exception as gpu_error:
        print(f"‚ö†Ô∏è GPU loading failed: {gpu_error}")
        print("üîÑ Falling back to CPU...")
        nlp = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=False)
        print("‚úÖ Stanza Hindi POS tagging pipeline loaded (CPU)")
except Exception as e:
    # Covers download failures and a failed CPU fallback: log the full
    # traceback and leave POS tagging disabled rather than aborting the cell.
    print(f"‚ö†Ô∏è Failed to load Stanza: {e}")
    import traceback
    traceback.print_exc()
    nlp = None


def clean(tok):
    """Strip punctuation from a token so NER words and POS tokens can be compared.

    Keeps word characters (``\\w``) and Devanagari letters, vowel signs,
    digits and other marks. Devanagari punctuation is removed like any other
    punctuation: danda (U+0964), double danda (U+0965) and the abbreviation
    sign (U+0970). Without this, a word with a sentence-final danda attached
    could never equal the separately tokenized word.

    Args:
        tok: The raw token text.

    Returns:
        The token with all punctuation and other non-word characters removed
        (possibly an empty string).
    """
    # The Devanagari block is listed explicitly because combining vowel signs
    # are not matched by \w; the ranges skip U+0964, U+0965 and U+0970.
    return re.sub(r"[^\w\u0900-\u0963\u0966-\u096F\u0971-\u097F]", "", tok)


def labels_to_entities(words: list[str], labels: list[str]) -> list[dict]:
    """Group BIO-style labels into entity spans.

    A ``B-<label>`` tag opens a new entity. An ``I-<label>`` tag extends the
    open entity only when the label matches. Any other tag (including an
    ``I-`` tag with no matching open entity) closes the open entity and is
    itself ignored.

    Returns:
        Entity dicts with ``text`` (space-joined words), ``label`` and
        inclusive word indices ``start``/``end``, in order of appearance.
    """
    spans: list[dict] = []
    open_span = None

    for idx, (word, tag) in enumerate(zip(words, labels)):
        if tag.startswith('B-'):
            # A new entity begins; flush whatever was open before it.
            if open_span:
                spans.append(open_span)
            open_span = {'text': word, 'label': tag[2:], 'start': idx, 'end': idx}
            continue

        if tag.startswith('I-') and open_span and tag[2:] == open_span['label']:
            open_span['text'] += ' ' + word
            open_span['end'] = idx
            continue

        # Outside tag, or a continuation that does not match: close the span.
        if open_span:
            spans.append(open_span)
        open_span = None

    if open_span:
        spans.append(open_span)
    return spans


def _is_oov_tag(tag: str) -> bool:
    """Return True when *tag* is one of the accepted 'no entity / OOV' markers."""
    return (tag == 'O'
            or tag.upper() in ('OOV', 'OUT OF VOCABULARY')
            or 'out of vocabulary' in tag.lower())


def _label_from_pos(pos_tag: str, prev_label: str) -> str:
    """Map a UPOS tag to the replacement label for an OOV word."""
    if pos_tag != "PROPN":
        return f"O ({pos_tag})"
    # Proper nouns become MISC entities; continue a MISC run if one is open.
    if prev_label.startswith(("B-MISC", "I-MISC")):
        return "I-MISC"
    return "B-MISC"


def _find_pos_token(tokens: list[str], target: str, start: int) -> Optional[int]:
    """Find *target* at or after *start*, else before it; None if absent."""
    try:
        return tokens.index(target, start)
    except ValueError:
        pass
    try:
        return tokens.index(target, 0, start)
    except ValueError:
        return None


def apply_pos_tagging_to_oov(text: str, preds: list[str], words: list[str]):
    """Replace 'O'/OOV labels with POS-derived labels from the Stanza pipeline.

    For every word whose NER label is an OOV marker, the word is aligned to a
    Stanza token (after punctuation stripping and NFC normalization) and:

    * a ``PROPN`` token turns the label into ``B-MISC`` / ``I-MISC``
      (``I-MISC`` when the previous word already carries a MISC label);
    * any other token turns the label into ``"O (<UPOS>)"``;
    * a word with no matching token becomes plain ``"O"``.

    Alignment keeps a cursor into the POS token list. The search looks forward
    from the cursor first and only then at earlier tokens. A failed search
    leaves the cursor where it was, so one unmatched word cannot derail the
    alignment of the words that follow it.

    Args:
        text: The full text given to the POS tagger.
        preds: One NER label per entry of ``words``.
        words: The whitespace-split words of ``text``.

    Returns:
        ``(labels, entities)``. When there is nothing to do (empty input, no
        pipeline, no OOV labels, or the tagger raises) the original ``preds``
        are returned with entities rebuilt from them. Otherwise a new label
        list and the merged entities derived from it are returned.
    """
    if not preds or not words:
        return preds, labels_to_entities(words, preds)

    if nlp is None:
        print("‚ö†Ô∏è Stanza pipeline not available, skipping POS tagging")
        return preds, labels_to_entities(words, preds)

    if not any(_is_oov_tag(tag) for tag in preds):
        return preds, labels_to_entities(words, preds)

    # Run POS tagging on the full text
    try:
        doc = nlp(text)
        pos_tokens = [clean(w.text) for s in doc.sentences for w in s.words]
        pos_tags = [w.upos for s in doc.sentences for w in s.words]

        # Debug: print first few tokens for alignment check
        if pos_tokens:
            print(f"üîç POS tagging: {len(pos_tokens)} tokens, first 5: {pos_tokens[:5]}")
            print(f"üîç NER words: {len(words)} words, first 5: {[clean(w) for w in words[:5]]}")
    except Exception as e:
        print(f"‚ö†Ô∏è POS tagging failed: {e}")
        import traceback
        traceback.print_exc()
        return preds, labels_to_entities(words, preds)

    # Compare in NFC form so differently composed but equivalent Devanagari
    # sequences still match.
    norm_tokens = [unicodedata.normalize('NFC', t) for t in pos_tokens]

    new_preds = list(preds)
    pos_i = 0  # cursor: next POS token expected to align with an NER word
    matched_count = 0
    oov_count = sum(1 for p in preds if _is_oov_tag(p))

    for i, (token, ner_label) in enumerate(zip(words, preds)):
        if not _is_oov_tag(ner_label):
            # Real entity label from the NER model: keep it untouched.
            continue

        target = unicodedata.normalize('NFC', clean(token))
        j = _find_pos_token(norm_tokens, target, pos_i)
        if j is None:
            # No match found, keep as O
            new_preds[i] = "O"
            continue

        prev_label = new_preds[i - 1] if i > 0 else "O"
        new_preds[i] = _label_from_pos(pos_tags[j], prev_label)
        matched_count += 1
        if j >= pos_i:
            # Only ever move the cursor forward; a match found behind it is a
            # fallback lookup and must not rewind the alignment.
            pos_i = j + 1

    print(f"‚úÖ POS tagged {matched_count} out of {oov_count} OOV words")

    new_entities = labels_to_entities(words, new_preds)
    return new_preds, merge_adjacent_entities(new_entities)



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  ‚Ä¶

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...


Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.11.0/models/default.zip:   0%|          | ‚Ä¶

INFO:stanza:Downloaded file to /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  ‚Ä¶

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package     |
---------------------------
| tokenize  | hdtb        |
| pos       | hdtb_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


‚ö†Ô∏è GPU loading failed: cuDNN version incompatibility: PyTorch was compiled  against (9, 10, 2) but found runtime version (9, 2, 1). PyTorch already comes bundled with cuDNN. One option to resolving this error is to ensure PyTorch can find the bundled cuDNN. one possibility is that there is a conflicting cuDNN in LD_LIBRARY_PATH.
üîÑ Falling back to CPU...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  ‚Ä¶

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package     |
---------------------------
| tokenize  | hdtb        |
| pos       | hdtb_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


‚úÖ Stanza Hindi POS tagging pipeline loaded (CPU)


In [None]:
# Run NER in chunks and display aggregated results.
# Notebook cell: reads the global `text` produced by the transcription step and
# leaves its results (labels, entities, DataFrames) in notebook globals.
if text:
    # Tune chunk_size/overlap if needed for your model/context
    chunk_size = 120
    overlap = 20

    # Get original predictions (before POS tagging)
    preds_original, entities_original = run_ner_in_chunks(text, chunk_size=chunk_size, overlap=overlap)

    # Tagged inline output: same whitespace split that run_ner_in_chunks uses,
    # so words[i] corresponds to preds_original[i].
    words = text.split()

    # Display original NER output
    tagged_original = [f"{w} ({t})" for w, t in zip(words, preds_original)] if preds_original else []

    print("\n=== Original NER Output (Before POS Tagging) ===\n")
    print(" ".join(tagged_original))

    if entities_original:
        print(f"\n=== Original Detected Entities: {len(entities_original)} ===\n")
        entity_df_original = pd.DataFrame(entities_original)
        display(entity_df_original)

    # Apply POS tagging for OOV words (tagged as 'O' or 'Out of Vocabulary').
    # Until that happens the "corrected" names alias the original objects.
    preds_corrected = preds_original
    entities_corrected = entities_original

    # Find all OOV words - check for multiple OOV tag formats
    oov_indices = []
    oov_words_list = []
    for i, (w, tag) in enumerate(zip(words, preds_original)):
        is_oov = (tag == 'O' or
                  tag.upper() == 'OUT OF VOCABULARY' or
                  'out of vocabulary' in tag.lower() or
                  tag.upper() == 'OOV')
        if is_oov:
            oov_indices.append(i)
            oov_words_list.append(w)

    if oov_words_list:
        print(f"\n=== OOV Words Found: {len(oov_words_list)} (applying POS tagging) ===\n")
        print(f"Sample OOV words: {oov_words_list[:10]}{'...' if len(oov_words_list) > 10 else ''}\n")

        # Apply POS tagging to OOV words
        preds_corrected, entities_corrected = apply_pos_tagging_to_oov(text, preds_original, words)

        # Track which words were tagged with POS.
        # Only labels rewritten to the "O (POS_TAG)" form are listed here; words
        # promoted to B-MISC/I-MISC do not contain 'O (' and are not counted, so
        # this total can be lower than the match count printed by
        # apply_pos_tagging_to_oov.
        pos_tagged_words = []
        for i, (orig_pred, corr_pred) in enumerate(zip(preds_original, preds_corrected)):
            if orig_pred != corr_pred and 'O (' in corr_pred:
                # Extract POS tag from format "O (POS_TAG)"
                pos_tag = corr_pred.split('(')[1].rstrip(')') if '(' in corr_pred else 'N/A'
                pos_tagged_words.append({
                    'word': words[i],
                    'original_tag': orig_pred,
                    'pos_tag': pos_tag
                })

        if pos_tagged_words:
            print(f"‚úÖ Applied POS tags to {len(pos_tagged_words)} OOV words:\n")
            pos_df = pd.DataFrame(pos_tagged_words)
            display(pos_df)
        else:
            print("‚ÑπÔ∏è No OOV words were POS tagged (may be alignment issue).")
    else:
        print("\n‚ÑπÔ∏è No OOV words found in the text.")

    # Display corrected output - extract POS tag from "O (POS_TAG)" format for OOV words
    # so they print as "word (POS_TAG)"; every other label is printed unchanged.
    tagged_corrected = []
    for w, t in zip(words, preds_corrected):
        if t.startswith('O (') and t.endswith(')'):
            # Extract POS tag from "O (POS_TAG)" format
            pos_tag = t[3:-1]  # Remove "O (" and ")"
            tagged_corrected.append(f"{w} ({pos_tag})")
        else:
            tagged_corrected.append(f"{w} ({t})")

    print("\n" + "="*70)
    print("\n=== Corrected NER Output (After POS Tagging) ===\n")
    print(" ".join(tagged_corrected))

    # Structured entities after correction
    if entities_corrected:
        print(f"\n=== Corrected Detected Entities: {len(entities_corrected)} ===\n")
        entity_df_corrected = pd.DataFrame(entities_corrected)
        display(entity_df_corrected)

        # Entity counts after correction (number of entities per label)
        counts_corrected = entity_df_corrected['label'].value_counts()
        print("\n=== Corrected Entity Counts ===\n")
        display(pd.DataFrame(counts_corrected))
    else:
        print("\n(No entities detected after correction.)")
else:
    # `text` is empty/falsy: nothing was transcribed or loaded.
    print("Skipping NER ‚Äî no text available.")


üì• Loading XLM-RoBERTa model from /content/drive/MyDrive/XLM_ROBERTA...
‚úÖ Model loaded successfully on cuda

=== Original NER Output (Before POS Tagging) ===

‡§∏‡•ã‡§®‡•á (O) ‡§ï‡•Ä (O) ‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ (O) ‡§ï‡§π‡§æ (O) ‡§ú‡§æ‡§®‡•á (O) ‡§µ‡§æ‡§≤‡§æ (O) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§¶‡•Å‡§®‡§ø‡§Ø‡§æ (O) ‡§ï‡•á (O) ‡§∏‡§¨‡§∏‡•á (O) ‡§™‡•ç‡§∞‡§æ‡§ö‡§ø‡§® (O) ‡§¶‡•á‡§∂‡•ã‡§Ç (O) ‡§Æ‡•à‡§Ç‡§∏‡•á (O) ‡§è‡§ï (O) ‡§π‡•à (O) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§ï‡§æ (O) ‡§á‡§§‡•ç‡§Ø‡§æ‡§∏ (O) ‡§¨‡•á‡§π‡§¶ (O) ‡§™‡•Ç‡§∞‡§æ‡§®‡§æ (O) ‡§π‡•à‡•§ (O) ‡§Ü‡§∞‡•ç‡§ï‡§ø‡§Ø‡§≤‡•ã‡§ú‡§ø‡§ï‡§≤ (O) ‡§∏‡§∞‡•ç‡§µ‡•á‡§ú (O) ‡§∏‡•á (O) ‡§™‡§§‡§æ (O) ‡§ö‡§≤‡§§‡§æ (O) ‡§π‡•à (O) ‡§ï‡§ø (O) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§Æ‡•á‡§Ç (O) ‡§á‡§®‡•ç‡§∏‡§æ‡§®‡•Ä (O) ‡§ó‡§§‡§ø‡§µ‡§ø‡§¶‡•Ä (O) ‡§¶‡•ã (B-TIMEX) ‡§≤‡§æ‡§ñ (I-TIMEX) ‡§™‡§ö‡§æ‡§∏ (I-TIMEX) ‡§Ö‡§ú‡§æ‡§∞ (I-TIMEX) ‡§∏‡§æ‡§≤‡•ã‡§Ç (I-TIMEX) ‡§∏‡•á (O) ‡§≠‡•Ä (O) ‡§ú‡§æ‡§¶‡§æ (O) ‡§™‡•Ç‡§∞‡§æ‡§®‡•Ä (O) ‡§π‡•à‡•§ (O) ‡§µ‡•à‡§ó‡•ç‡§Ø‡§æ‡§®‡§ø‡§ï‡•ã‡§Ç (O) ‡§ï‡§æ (O) ‡§Æ‡§

Unnamed: 0,text,label,start,end
0,‡§≠‡§æ‡§∞‡§§,LOCATION,6,6
1,‡§≠‡§æ‡§∞‡§§,LOCATION,15,15
2,‡§≠‡§æ‡§∞‡§§,LOCATION,28,28
3,‡§¶‡•ã ‡§≤‡§æ‡§ñ ‡§™‡§ö‡§æ‡§∏ ‡§Ö‡§ú‡§æ‡§∞ ‡§∏‡§æ‡§≤‡•ã‡§Ç,TIMEX,32,36
4,10,NUMEX,60,60
5,12,NUMEX,62,62
6,‡§≠‡§æ‡§∞‡§§‡§≠‡§æ‡§∞‡§§,LOCATION,106,106
7,‡§≠‡§æ‡§∞‡§§,LOCATION,138,138
8,‡§≠‡§æ‡§∞‡§§,LOCATION,186,186
9,‡§≠‡§æ‡§∞‡§§,LOCATION,199,199



=== OOV Words Found: 323 (applying POS tagging) ===

Sample OOV words: ['‡§∏‡•ã‡§®‡•á', '‡§ï‡•Ä', '‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ', '‡§ï‡§π‡§æ', '‡§ú‡§æ‡§®‡•á', '‡§µ‡§æ‡§≤‡§æ', '‡§¶‡•Å‡§®‡§ø‡§Ø‡§æ', '‡§ï‡•á', '‡§∏‡§¨‡§∏‡•á', '‡§™‡•ç‡§∞‡§æ‡§ö‡§ø‡§®']...

üîç POS tagging: 373 tokens, first 5: ['‡§∏‡•ã‡§®‡•á', '‡§ï‡•Ä', '‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ', '‡§ï‡§π‡§æ', '‡§ú‡§æ‡§®‡•á']
üîç NER words: 348 words, first 5: ['‡§∏‡•ã‡§®‡•á', '‡§ï‡•Ä', '‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ', '‡§ï‡§π‡§æ', '‡§ú‡§æ‡§®‡•á']
‚úÖ POS tagged 305 out of 323 OOV words
‚úÖ Applied POS tags to 283 OOV words:



Unnamed: 0,word,original_tag,pos_tag
0,‡§∏‡•ã‡§®‡•á,O,NOUN
1,‡§ï‡•Ä,O,ADP
2,‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ,O,NOUN
3,‡§ï‡§π‡§æ,O,VERB
4,‡§ú‡§æ‡§®‡•á,O,AUX
...,...,...,...
278,‡§∂‡•Å‡§∞‡•Ç,O,NOUN
279,‡§π‡•ã‡§ï‡§∞,O,VERB
280,‡§™‡•Ç‡§∞‡•Ä,O,ADJ
281,‡§¶‡•Å‡§®‡§ø‡§Ø‡§æ,O,NOUN




=== Corrected NER Output (After POS Tagging) ===

‡§∏‡•ã‡§®‡•á (NOUN) ‡§ï‡•Ä (ADP) ‡§ö‡§ø‡§°‡§ø‡§Ø‡§æ (NOUN) ‡§ï‡§π‡§æ (VERB) ‡§ú‡§æ‡§®‡•á (AUX) ‡§µ‡§æ‡§≤‡§æ (ADJ) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§¶‡•Å‡§®‡§ø‡§Ø‡§æ (NOUN) ‡§ï‡•á (ADP) ‡§∏‡§¨‡§∏‡•á (ADV) ‡§™‡•ç‡§∞‡§æ‡§ö‡§ø‡§® (ADJ) ‡§¶‡•á‡§∂‡•ã‡§Ç (NOUN) ‡§Æ‡•à‡§Ç‡§∏‡•á (ADP) ‡§è‡§ï (NUM) ‡§π‡•à (AUX) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§ï‡§æ (ADP) ‡§á‡§§‡•ç‡§Ø‡§æ‡§∏ (NOUN) ‡§¨‡•á‡§π‡§¶ (ADV) ‡§™‡•Ç‡§∞‡§æ‡§®‡§æ (ADJ) ‡§π‡•à‡•§ (O) ‡§Ü‡§∞‡•ç‡§ï‡§ø‡§Ø‡§≤‡•ã‡§ú‡§ø‡§ï‡§≤ (B-MISC) ‡§∏‡§∞‡•ç‡§µ‡•á‡§ú (NOUN) ‡§∏‡•á (ADP) ‡§™‡§§‡§æ (NOUN) ‡§ö‡§≤‡§§‡§æ (VERB) ‡§π‡•à (AUX) ‡§ï‡§ø (SCONJ) ‡§≠‡§æ‡§∞‡§§ (B-LOCATION) ‡§Æ‡•á‡§Ç (ADP) ‡§á‡§®‡•ç‡§∏‡§æ‡§®‡•Ä (ADJ) ‡§ó‡§§‡§ø‡§µ‡§ø‡§¶‡•Ä (NOUN) ‡§¶‡•ã (B-TIMEX) ‡§≤‡§æ‡§ñ (I-TIMEX) ‡§™‡§ö‡§æ‡§∏ (I-TIMEX) ‡§Ö‡§ú‡§æ‡§∞ (I-TIMEX) ‡§∏‡§æ‡§≤‡•ã‡§Ç (I-TIMEX) ‡§∏‡•á (ADP) ‡§≠‡•Ä (PART) ‡§ú‡§æ‡§¶‡§æ (ADJ) ‡§™‡•Ç‡§∞‡§æ‡§®‡•Ä (ADJ) ‡§π‡•à‡•§ (O) ‡§µ‡•à‡§ó‡•ç‡§Ø‡§æ‡§®‡§ø‡§ï‡•ã‡§Ç (NOUN) ‡§ï‡§æ (ADP) ‡§Æ‡§®‡§®‡§æ (NOUN) ‡§π‡•à‡§Ç (

Unnamed: 0,text,label,start,end
0,‡§≠‡§æ‡§∞‡§§,LOCATION,6,6
1,‡§≠‡§æ‡§∞‡§§,LOCATION,15,15
2,‡§Ü‡§∞‡•ç‡§ï‡§ø‡§Ø‡§≤‡•ã‡§ú‡§ø‡§ï‡§≤,MISC,21,21
3,‡§≠‡§æ‡§∞‡§§,LOCATION,28,28
4,‡§¶‡•ã ‡§≤‡§æ‡§ñ ‡§™‡§ö‡§æ‡§∏ ‡§Ö‡§ú‡§æ‡§∞ ‡§∏‡§æ‡§≤‡•ã‡§Ç,TIMEX,32,36
5,‡§π‡•ã‡§Æ‡•ã ‡§∏‡•á‡§™‡§ø‡§Ø‡§Ç‡§∏,MISC,51,52
6,10,NUMEX,60,60
7,12,NUMEX,62,62
8,‡§π‡•Å‡§Æ‡•á‡§® ‡§è‡§ï‡•ç‡§ü‡§ø‡§µ‡§ø‡§ü‡•Ä‡§ú,MISC,97,98
9,‡§≠‡§æ‡§∞‡§§,LOCATION,106,106



=== Corrected Entity Counts ===



Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
MISC,15
LOCATION,13
NUMEX,3
TIMEX,2
RELIGION,1


In [None]:
# Install Gradio and soundfile for advanced UI
%pip -q install gradio soundfile


Final update 2.1

In [None]:
import gradio as gr
import tempfile
import json
from pathlib import Path
import pandas as pd
from typing import Tuple, List, Dict, Optional

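# Custom CSS for the dark-theme UI (layout, animations, per-entity tag classes).
# NOTE(review): `.entity-*` classes exist only for PERSON, LOCATION, ORGANIZATION,
# MISC, DATE and TIME; the NER output above also emits TIMEX, NUMEX and RELIGION.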
custom_css = """
/* Main container styling - Dark theme */
.gradio-container {
    font-family: 'Inter', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background: linear-gradient(135deg, #0a0e27 0%, #1a1a2e 50%, #16213e 100%);
    min-height: 100vh;
    color: #e0e0e0;
}

/* Animated background particles */
.gradio-container::before {
    content: '';
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background-image:
        radial-gradient(circle at 20% 50%, rgba(138, 43, 226, 0.1) 0%, transparent 50%),
        radial-gradient(circle at 80% 80%, rgba(0, 191, 255, 0.1) 0%, transparent 50%),
        radial-gradient(circle at 40% 20%, rgba(138, 43, 226, 0.05) 0%, transparent 50%);
    animation: backgroundShift 20s ease-in-out infinite;
    pointer-events: none;
    z-index: 0;
}

@keyframes backgroundShift {
    0%, 100% { opacity: 1; transform: scale(1); }
    50% { opacity: 0.8; transform: scale(1.1); }
}

/* Header styling - Dark with glow effect */
.header-section {
    background: linear-gradient(135deg, #1a0033 0%, #2d1b4e 50%, #1a1a2e 100%);
    padding: 35px;
    border-radius: 20px;
    margin-bottom: 25px;
    box-shadow:
        0 0 30px rgba(138, 43, 226, 0.4),
        0 0 60px rgba(0, 191, 255, 0.2),
        inset 0 0 20px rgba(138, 43, 226, 0.1);
    color: white;
    text-align: center;
    border: 2px solid rgba(138, 43, 226, 0.3);
    position: relative;
    overflow: hidden;
    animation: headerGlow 3s ease-in-out infinite;
}

@keyframes headerGlow {
    0%, 100% { box-shadow: 0 0 30px rgba(138, 43, 226, 0.4), 0 0 60px rgba(0, 191, 255, 0.2), inset 0 0 20px rgba(138, 43, 226, 0.1); }
    50% { box-shadow: 0 0 40px rgba(138, 43, 226, 0.6), 0 0 80px rgba(0, 191, 255, 0.3), inset 0 0 30px rgba(138, 43, 226, 0.2); }
}

.header-section::before {
    content: '';
    position: absolute;
    top: -50%;
    left: -50%;
    width: 200%;
    height: 200%;
    background: radial-gradient(circle, rgba(138, 43, 226, 0.1) 0%, transparent 70%);
    animation: pulse 4s ease-in-out infinite;
}

@keyframes pulse {
    0%, 100% { transform: scale(1) rotate(0deg); opacity: 0.5; }
    50% { transform: scale(1.2) rotate(180deg); opacity: 0.8; }
}

.header-section h1 {
    margin: 0;
    font-size: 2.5em;
    font-weight: 700;
    background: linear-gradient(45deg, #00d4ff, #8a2be2, #00d4ff);
    background-size: 200% auto;
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    animation: gradientShift 3s linear infinite;
    text-shadow: 0 0 30px rgba(0, 212, 255, 0.5);
    position: relative;
    z-index: 1;
}

@keyframes gradientShift {
    0% { background-position: 0% center; }
    100% { background-position: 200% center; }
}

.header-section p {
    margin: 10px 0 0 0;
    font-size: 1.1em;
    opacity: 0.9;
    position: relative;
    z-index: 1;
    animation: fadeInUp 1s ease-out;
}

@keyframes fadeInUp {
    from { opacity: 0; transform: translateY(20px); }
    to { opacity: 0.9; transform: translateY(0); }
}

/* Card styling - Dark theme */
.card {
    background: rgba(26, 26, 46, 0.8);
    border-radius: 15px;
    padding: 25px;
    margin: 15px 0;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
    border: 1px solid rgba(138, 43, 226, 0.2);
    transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 1.275);
    backdrop-filter: blur(10px);
}

.card:hover {
    transform: translateY(-8px) scale(1.02);
    box-shadow: 0 15px 40px rgba(138, 43, 226, 0.4), 0 0 20px rgba(0, 191, 255, 0.2);
    border-color: rgba(138, 43, 226, 0.5);
}

/* Button styling - Animated gradient */
.primary-button, button.primary {
    background: linear-gradient(135deg, #8a2be2 0%, #00d4ff 50%, #8a2be2 100%);
    background-size: 200% auto;
    border: none;
    border-radius: 12px;
    padding: 15px 30px;
    font-size: 1.1em;
    font-weight: 600;
    color: white;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 8px 20px rgba(138, 43, 226, 0.4);
    animation: buttonGradient 3s linear infinite;
    position: relative;
    overflow: hidden;
}

@keyframes buttonGradient {
    0% { background-position: 0% center; }
    100% { background-position: 200% center; }
}

.primary-button::before, button.primary::before {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent);
    transition: left 0.5s;
}

.primary-button:hover::before, button.primary:hover::before {
    left: 100%;
}

.primary-button:hover, button.primary:hover {
    transform: translateY(-3px) scale(1.05);
    box-shadow: 0 12px 30px rgba(138, 43, 226, 0.6), 0 0 20px rgba(0, 191, 255, 0.4);
}

/* Entity tags styling - Enhanced animations */
.entity-tag {
    display: inline-block;
    padding: 6px 14px;
    margin: 4px;
    border-radius: 20px;
    font-size: 0.9em;
    font-weight: 600;
    color: white;
    text-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
    animation: tagFadeIn 0.6s cubic-bezier(0.68, -0.55, 0.265, 1.55);
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
}

.entity-tag:hover {
    transform: scale(1.1) rotate(2deg);
    box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3);
}

@keyframes tagFadeIn {
    from {
        opacity: 0;
        transform: translateY(-20px) scale(0.8) rotate(-10deg);
    }
    to {
        opacity: 1;
        transform: translateY(0) scale(1) rotate(0deg);
    }
}

.entity-PERSON {
    background: linear-gradient(135deg, #ff6b9d 0%, #c44569 100%);
    animation-delay: 0.1s;
}
.entity-LOCATION {
    background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
    animation-delay: 0.2s;
}
.entity-ORGANIZATION {
    background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
    animation-delay: 0.3s;
}
.entity-MISC {
    background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
    animation-delay: 0.4s;
}
.entity-DATE {
    background: linear-gradient(135deg, #30cfd0 0%, #330867 100%);
    animation-delay: 0.5s;
}
.entity-TIME {
    background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
    animation-delay: 0.6s;
}

/* Text output styling - Dark theme */
.text-output, textarea, input[type="text"] {
    background: rgba(22, 33, 62, 0.8) !important;
    color: #e0e0e0 !important;
    border-radius: 12px;
    padding: 20px;
    font-family: 'Courier New', monospace;
    font-size: 1.05em;
    line-height: 1.8;
    border: 2px solid rgba(138, 43, 226, 0.3) !important;
    max-height: 500px;
    overflow-y: auto;
    transition: all 0.3s ease;
}

.text-output:focus, textarea:focus, input[type="text"]:focus {
    border-color: rgba(138, 43, 226, 0.6) !important;
    box-shadow: 0 0 20px rgba(138, 43, 226, 0.3);
    outline: none;
}

/* Table styling - Dark theme */
.dataframe {
    border-radius: 12px;
    overflow: hidden;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
    background: rgba(26, 26, 46, 0.9) !important;
    animation: tableSlideIn 0.5s ease-out;
}

@keyframes tableSlideIn {
    from { opacity: 0; transform: translateX(-20px); }
    to { opacity: 1; transform: translateX(0); }
}

.dataframe thead {
    background: linear-gradient(135deg, #8a2be2 0%, #00d4ff 100%) !important;
    color: white !important;
}

.dataframe tbody {
    background: rgba(22, 33, 62, 0.8) !important;
    color: #e0e0e0 !important;
}

.dataframe tbody tr {
    transition: all 0.3s ease;
}

.dataframe tbody tr:nth-child(even) {
    background: rgba(26, 26, 46, 0.6) !important;
}

.dataframe tbody tr:hover {
    background: rgba(138, 43, 226, 0.2) !important;
    transform: scale(1.01);
    box-shadow: 0 4px 15px rgba(138, 43, 226, 0.3);
}

/* Stats cards - Dark theme */
.stats-card {
    background: linear-gradient(135deg, rgba(138, 43, 226, 0.3) 0%, rgba(0, 212, 255, 0.3) 100%);
    color: white;
    padding: 20px;
    border-radius: 15px;
    text-align: center;
    margin: 10px;
    box-shadow: 0 8px 25px rgba(138, 43, 226, 0.4);
    border: 1px solid rgba(138, 43, 226, 0.3);
    animation: statsPulse 2s ease-in-out infinite;
}

@keyframes statsPulse {
    0%, 100% { transform: scale(1); }
    50% { transform: scale(1.02); }
}

.stats-number {
    font-size: 2.5em;
    font-weight: 700;
    margin: 10px 0;
    text-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
    animation: numberCount 1s ease-out;
}

@keyframes numberCount {
    from { transform: scale(0); opacity: 0; }
    to { transform: scale(1); opacity: 1; }
}

.stats-label {
    font-size: 1.1em;
    opacity: 0.9;
}

/* Loading animation - Enhanced */
.loading {
    display: inline-block;
    width: 24px;
    height: 24px;
    border: 3px solid rgba(138, 43, 226, 0.3);
    border-radius: 50%;
    border-top-color: #8a2be2;
    border-right-color: #00d4ff;
    animation: spin 0.8s linear infinite;
}

@keyframes spin {
    to { transform: rotate(360deg); }
}

/* Progress bar - Dark theme */
.progress-bar {
    width: 100%;
    height: 10px;
    background: rgba(26, 26, 46, 0.8);
    border-radius: 10px;
    overflow: hidden;
    margin: 10px 0;
    border: 1px solid rgba(138, 43, 226, 0.3);
}

.progress-fill {
    height: 100%;
    background: linear-gradient(90deg, #8a2be2 0%, #00d4ff 100%);
    border-radius: 10px;
    animation: progress 2s ease, progressGlow 2s ease-in-out infinite;
    box-shadow: 0 0 10px rgba(138, 43, 226, 0.5);
}

@keyframes progress {
    from { width: 0%; }
}

@keyframes progressGlow {
    0%, 100% { box-shadow: 0 0 10px rgba(138, 43, 226, 0.5); }
    50% { box-shadow: 0 0 20px rgba(0, 212, 255, 0.8); }
}

/* Upload area styling - Dark theme */
.upload-area {
    border: 3px dashed rgba(138, 43, 226, 0.5);
    border-radius: 15px;
    padding: 40px;
    text-align: center;
    background: rgba(22, 33, 62, 0.5);
    transition: all 0.4s ease;
    position: relative;
    overflow: hidden;
}

.upload-area::before {
    content: '';
    position: absolute;
    top: -50%;
    left: -50%;
    width: 200%;
    height: 200%;
    background: linear-gradient(45deg, transparent, rgba(138, 43, 226, 0.1), transparent);
    animation: uploadShine 3s linear infinite;
}

@keyframes uploadShine {
    0% { transform: translateX(-100%) translateY(-100%) rotate(45deg); }
    100% { transform: translateX(100%) translateY(100%) rotate(45deg); }
}

.upload-area:hover {
    background: rgba(138, 43, 226, 0.1);
    border-color: rgba(138, 43, 226, 0.8);
    transform: scale(1.02);
    box-shadow: 0 0 30px rgba(138, 43, 226, 0.4);
}

/* Section headers - Dark theme */
.section-header {
    font-size: 1.5em;
    font-weight: 700;
    color: #e0e0e0;
    margin: 20px 0 15px 0;
    padding-bottom: 10px;
    border-bottom: 3px solid rgba(138, 43, 226, 0.5);
    text-shadow: 0 0 10px rgba(138, 43, 226, 0.3);
    animation: headerSlide 0.5s ease-out;
}

@keyframes headerSlide {
    from { opacity: 0; transform: translateX(-20px); }
    to { opacity: 1; transform: translateX(0); }
}

/* Tabs styling - Dark theme */
.tab-nav {
    background: rgba(26, 26, 46, 0.8) !important;
    border-radius: 10px 10px 0 0;
}

.tab-nav button {
    color: #e0e0e0 !important;
    transition: all 0.3s ease;
}

.tab-nav button:hover {
    background: rgba(138, 43, 226, 0.2) !important;
    transform: translateY(-2px);
}

.tab-nav button.selected {
    background: linear-gradient(135deg, #8a2be2 0%, #00d4ff 100%) !important;
    color: white !important;
    box-shadow: 0 4px 15px rgba(138, 43, 226, 0.4);
}

/* Markdown text - Dark theme */
.markdown, .markdown p, .markdown h1, .markdown h2, .markdown h3 {
    color: #e0e0e0 !important;
}

/* Scrollbar styling - Dark theme */
::-webkit-scrollbar {
    width: 10px;
    height: 10px;
}

::-webkit-scrollbar-track {
    background: rgba(26, 26, 46, 0.5);
    border-radius: 10px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(135deg, #8a2be2 0%, #00d4ff 100%);
    border-radius: 10px;
    transition: background 0.3s ease;
}

::-webkit-scrollbar-thumb:hover {
    background: linear-gradient(135deg, #00d4ff 0%, #8a2be2 100%);
}

/* General dark theme overrides */
.dark-bg {
    background: rgba(22, 33, 62, 0.8) !important;
    color: #e0e0e0 !important;
}

/* Fade in animation for all content */
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(20px); }
    to { opacity: 1; transform: translateY(0); }
}

.gradio-container > * {
    animation: fadeIn 0.6s ease-out;
}
"""

# Entity color mapping for visualization.
# Keys are bare entity types (no B-/I- prefix). The palette covers every entity
# type listed in the word-table tag order used by process_audio, so none of them
# falls back to the light-gray 'O' color (unreadable behind white label text).
ENTITY_COLORS = {
    'PERSON': '#f5576c',
    'LOCATION': '#4facfe',
    'ORGANIZATION': '#43e97b',
    'MISC': '#fa709a',
    'DATE': '#30cfd0',
    'TIME': '#a8edea',
    'TIMEX': '#a8edea',  # same color the old substring match on 'TIME' produced
    'FESTIVAL': '#ff9a3c',
    'GAME': '#7f53ac',
    'LANGUAGE': '#11998e',
    'LITERATURE': '#b06ab3',
    'NUMEX': '#f7971e',
    'RELIGION': '#c471ed',
    'O': '#e0e0e0',
}


def get_entity_color(label: str) -> str:
    """Return the display color for an entity label.

    Args:
        label: Entity label, with or without a BIO prefix and in any case
            (e.g. ``'B-PERSON'``, ``'person'``, ``'LOCATION'``).

    Returns:
        The hex color registered in ``ENTITY_COLORS``. Lookup is exact first,
        then falls back to the first registered type contained in the label,
        and finally to the neutral ``'O'`` color.
    """
    default_color = ENTITY_COLORS.get('O', '#e0e0e0')

    normalized = str(label).strip().upper()
    if normalized.startswith(('B-', 'I-')):
        normalized = normalized[2:]

    if normalized in ENTITY_COLORS:
        return ENTITY_COLORS[normalized]

    # Substring fallback keeps compound labels working (e.g. 'PERSON_NAME').
    # 'O' is skipped: as a single letter it would match almost any label.
    for key, color in ENTITY_COLORS.items():
        if key != 'O' and key in normalized:
            return color
    return default_color

def format_tagged_text(words: List[str], labels: List[str]) -> str:
    """Render words and their tags as color-coded HTML.

    Args:
        words: Tokens to display.
        labels: One tag per token. ``B-X`` / ``I-X`` tags are rendered as a
            colored entity badge, ``O (POS)`` tags as a yellow POS badge, and
            anything else as plain text.

    Returns:
        A single HTML string with one ``<span>`` per word, joined by spaces.
        Pairs are formed with ``zip``, so extra items in the longer list are
        ignored.
    """
    # Function-scope import keeps this block self-contained.
    import html

    html_parts = []
    for word, label in zip(words, labels):
        # The text comes from transcription, so it must be escaped before it is
        # embedded in markup; otherwise '<' or '&' would break or inject HTML.
        safe_word = html.escape(str(word))

        if label.startswith(('B-', 'I-')):
            entity_type = label[2:]
            color = get_entity_color(entity_type)
            html_parts.append(
                f'<span style="background: {color}; color: white; padding: 3px 8px; '
                f'border-radius: 5px; margin: 2px; display: inline-block; font-weight: 600;">'
                f'{safe_word} <small>({html.escape(entity_type)})</small></span>'
            )
        elif label.startswith('O (') and ')' in label:
            # Strip the leading 'O (' and the trailing ')'.
            pos_tag = label[3:-1]
            html_parts.append(
                f'<span style="background: #fff3cd; color: #856404; padding: 3px 8px; '
                f'border-radius: 5px; margin: 2px; display: inline-block;">'
                f'{safe_word} <small>({html.escape(pos_tag)})</small></span>'
            )
        else:
            html_parts.append(f'<span style="margin: 2px; display: inline-block;">{safe_word}</span>')
    return ' '.join(html_parts)

# Column order of the word table: every BIO tag the table can show, followed by
# a catch-all column for plain 'O' tokens. POS-style 'O (TAG)' labels are never
# shown in this table.
_WORD_TABLE_TAG_ORDER = (
    'B-FESTIVAL', 'I-FESTIVAL',
    'B-GAME', 'I-GAME',
    'B-LANGUAGE', 'I-LANGUAGE',
    'B-LITERATURE', 'I-LITERATURE',
    'B-LOCATION', 'I-LOCATION',
    'B-MISC', 'I-MISC',
    'B-NUMEX', 'I-NUMEX',
    'B-ORGANIZATION', 'I-ORGANIZATION',
    'B-PERSON', 'I-PERSON',
    'B-RELIGION', 'I-RELIGION',
    'B-TIMEX', 'I-TIMEX',
    'Out of Vocabulary',
)


def _empty_audio_result(message: str) -> Tuple[str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, Dict]:
    """Build the six-value result used when there is nothing to display."""
    return message, "", pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), {}


def _resolve_audio_path(audio_file) -> Tuple[Path, bool]:
    """Turn the Gradio audio value into a path; the flag tells whether we created a temp file."""
    if isinstance(audio_file, tuple):
        first = audio_file[0]
        if isinstance(first, (int, float)):
            # (sample_rate, samples): write the raw audio to a temporary WAV file.
            import tempfile
            import soundfile as sf

            # Close the handle right away; only the reserved name is needed.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
                temp_path = Path(temp_file.name)
            try:
                sf.write(str(temp_path), audio_file[1], int(first))
            except Exception:
                # Do not leave a half-written file behind if encoding fails.
                temp_path.unlink(missing_ok=True)
                raise
            return temp_path, True
        # Tuple whose first element is a path rather than a sample rate.
        return Path(first), False
    return Path(audio_file), False


def _build_word_table(words: List[str], tags: List[str]) -> pd.DataFrame:
    """Group words under their tag, one column per tag present, duplicates shown as 'word(n)'."""
    from collections import Counter

    words_by_tag = {}  # tag -> Counter of the words carrying that tag
    for word, tag in zip(words, tags):
        # POS tags ('O (TAG)') are not shown in this table.
        if tag.startswith('O (') and ')' in tag:
            continue
        if tag == 'O' or 'out of vocabulary' in tag.lower():
            tag = 'Out of Vocabulary'
        if tag not in _WORD_TABLE_TAG_ORDER:
            continue
        words_by_tag.setdefault(tag, Counter())[word] += 1

    # Keep only the tags that occurred, in the predefined column order.
    columns = {
        tag: [
            f"{word}({count})" if count > 1 else word
            for word, count in words_by_tag[tag].items()
        ]
        for tag in _WORD_TABLE_TAG_ORDER
        if tag in words_by_tag
    }

    # Columns have different lengths; pad the shorter ones with empty cells.
    row_count = max((len(cells) for cells in columns.values()), default=0)
    rows = [
        {tag: (cells[row_idx] if row_idx < len(cells) else '') for tag, cells in columns.items()}
        for row_idx in range(row_count)
    ]
    return pd.DataFrame(rows)


def process_audio(audio_file) -> Tuple[str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, Dict]:
    """Run the full pipeline for the Gradio interface.

    Transcribes the audio, spell-checks the text, runs chunked NER, applies POS
    tagging, and prepares everything the UI tabs display.

    Args:
        audio_file: Value delivered by the Gradio audio component: a file path,
            or a ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            uploaded.

    Returns:
        ``(transcription, tagged_text_html, word_table_df, entities_df,
        stats_df, summary_dict)``. On any failure the first element carries the
        error message (with traceback) and the remaining elements are empty.
    """
    if audio_file is None:
        return _empty_audio_result("‚ùå Please upload an audio file")

    try:
        audio_path, is_temp_copy = _resolve_audio_path(audio_file)

        # Step 1: Transcribe audio
        try:
            transcribed_text = transcriber.transcribe(audio_path)
        finally:
            # Remove only the file created here, and do so even when
            # transcription raises. Cleanup is best-effort.
            if is_temp_copy:
                try:
                    audio_path.unlink()
                except OSError:
                    pass

        if not transcribed_text:
            return _empty_audio_result("‚ö†Ô∏è Transcription failed or returned empty text")

        # Apply spell check
        corrected_text = spell_checker.correct_text(transcribed_text)

        # Step 2: Run NER
        preds_original, entities_original = run_ner_in_chunks(corrected_text, chunk_size=120, overlap=20)

        words = corrected_text.split()

        # Step 3: Apply POS tagging to OOV words
        preds_corrected, entities_corrected = apply_pos_tagging_to_oov(corrected_text, preds_original, words)

        # Format tagged text with HTML
        tagged_html = format_tagged_text(words, preds_corrected)

        # Table with tag names as columns and the words grouped under each tag
        word_table_df = _build_word_table(words, preds_corrected)

        # Entities table and per-label statistics
        if entities_corrected:
            entities_df = pd.DataFrame(entities_corrected)
            # Add color column for visualization
            entities_df['color'] = entities_df['label'].apply(get_entity_color)

            entity_counts = pd.DataFrame(entities_corrected).groupby('label').size().reset_index(name='count')
            entity_counts.columns = ['Entity Type', 'Count']
            entity_counts = entity_counts.sort_values('Count', ascending=False)
        else:
            entities_df = pd.DataFrame(columns=['text', 'label', 'start', 'end'])
            entity_counts = pd.DataFrame(columns=['Entity Type', 'Count'])

        # Summary statistics
        summary = {
            'total_words': len(words),
            'total_entities': len(entities_corrected),
            'entity_types': len({e['label'] for e in entities_corrected}) if entities_corrected else 0,
            'transcription_length': len(corrected_text)
        }

        return corrected_text, tagged_html, word_table_df, entities_df, entity_counts, summary

    except Exception as e:
        # UI boundary: report the failure in the transcription box instead of crashing the app.
        import traceback
        error_msg = f"‚ùå Error: {str(e)}\n\n{traceback.format_exc()}"
        return _empty_audio_result(error_msg)


def run_dataset_evaluation_ui(dataset_file, max_samples_input):
    """Wrapper used by the Gradio Evaluation tab.

    Args:
        dataset_file: Path of the uploaded labelled dataset. When empty,
            ``HindiNER.xlsx`` in the working directory is used if it exists.
        max_samples_input: Optional sample limit as typed by the user. Blank
            or non-positive values mean "evaluate on all samples".

    Returns:
        ``(metrics_df, metadata, status_message)``. On any problem the
        DataFrame and metadata are empty and the message explains what failed.
    """
    try:
        if dataset_file:
            dataset_path = Path(dataset_file)
        else:
            dataset_path = Path("HindiNER.xlsx")
            if not dataset_path.exists():
                return pd.DataFrame(), {}, "‚ùå Please upload a labelled dataset (.xlsx or .csv)."

        max_samples = None
        raw_limit = "" if max_samples_input is None else str(max_samples_input).strip()
        if raw_limit:
            try:
                # Going through float accepts inputs such as "100.0".
                parsed_limit = int(float(raw_limit))
            except (TypeError, ValueError, OverflowError):
                # OverflowError covers "inf" / "1e999", which float() accepts but int() rejects.
                return pd.DataFrame(), {}, "‚ö†Ô∏è Invalid max samples value. Provide a positive integer or leave blank."
            if parsed_limit > 0:
                max_samples = parsed_limit

        metrics = evaluate_model_on_dataset(
            dataset_path=str(dataset_path),
            max_samples=max_samples,
            verbose=False,
        )
        metrics_df = metrics_to_dataframe(metrics).reset_index()
        metadata = metrics.get("metadata", {})
        return metrics_df, metadata, "‚úÖ Evaluation completed successfully."

    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return pd.DataFrame(), {}, f"‚ùå Evaluation failed: {e}"


# Create Gradio interface
def create_gradio_ui():
    """Create the Gradio interface with custom styling.

    Builds a ``gr.Blocks`` app styled with ``custom_css``: an upload column on
    the left and result tabs on the right (word table, transcription, tagged
    text, entities, statistics, evaluation). The "Process Audio" button is
    wired to ``process_audio`` and the "Run Evaluation" button to
    ``run_dataset_evaluation_ui``.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """

    # Create custom dark theme - using Monochrome as base for better compatibility
    try:
        dark_theme = gr.themes.Monochrome(
            primary_hue="purple",
            secondary_hue="cyan",
            neutral_hue="slate",
        )
    except Exception:
        # Fallback to default Monochrome if customization fails
        try:
            dark_theme = gr.themes.Monochrome()
        except Exception:
            # Final fallback to Default theme
            dark_theme = gr.themes.Default()

    with gr.Blocks(css=custom_css, theme=dark_theme) as demo:
        # Header
        gr.HTML("""
        <div class="header-section">
            <h1>üéµ Hindi Audio ‚Üí NER + POS Tagging</h1>
            <p>Advanced Named Entity Recognition with Part-of-Speech Tagging for Hindi Audio</p>
        </div>
        """)

        with gr.Row():
            # Left column: upload control, action button and usage instructions
            with gr.Column(scale=1):
                gr.Markdown("### üì§ Upload Audio File")
                # type="filepath": the handler is given a path string
                audio_input = gr.Audio(
                    label="Audio File",
                    type="filepath",
                    sources=["upload"]
                )

                process_btn = gr.Button(
                    "üöÄ Process Audio",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("""
                ### ‚ÑπÔ∏è Instructions
                1. Upload a Hindi audio file (WAV, MP3, M4A, FLAC, or OGG)
                2. Click "Process Audio" to start transcription and NER analysis
                3. View results in the tabs below

                **Supported formats:** WAV, MP3, M4A, FLAC, OGG
                """)

            # Right column: one tab per result view
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.Tab("üìã Word Table"):
                        word_table = gr.Dataframe(
                            label="Words Grouped by Tag Type (Only tags present in output are shown)",
                            interactive=False,
                            wrap=True
                        )
                    with gr.Tab("üìù Transcription"):
                        transcription_output = gr.Textbox(
                            label="Transcribed Text",
                            lines=10,
                            interactive=False,
                            show_copy_button=True
                        )

                    with gr.Tab("üè∑Ô∏è Tagged Text"):
                        tagged_output = gr.HTML(
                            label="NER + POS Tagged Text",
                            value="<p style='text-align: center; color: #666;'>Upload and process audio to see tagged text here...</p>"
                        )

                    with gr.Tab("üìä Entities"):
                        # NOTE(review): four widths are given here, while process_audio
                        # appends a 'color' column to the entities frame - confirm the
                        # widths still line up with the columns actually returned.
                        entities_table = gr.Dataframe(
                            label="Detected Entities",
                            interactive=False,
                            wrap=True,
                            column_widths=["40%", "30%", "15%", "15%"]
                        )

                    with gr.Tab("üìà Statistics"):
                        stats_table = gr.Dataframe(
                            label="Entity Statistics",
                            interactive=False,
                            wrap=True
                        )

                        stats_summary = gr.JSON(
                            label="Summary Statistics",
                            value={}
                        )

                    with gr.Tab("üìè Evaluation"):
                        gr.Markdown("### üìè Evaluate on Labelled Dataset")
                        gr.Markdown(
                            "Upload a labelled dataset (columns: `tokens`, `ner_tags`) or leave blank to use `HindiNER.xlsx` if present."
                        )
                        eval_dataset = gr.File(
                            label="Dataset File (.xlsx or .csv)",
                            file_types=[".xlsx", ".csv"],
                            file_count="single",
                            type="filepath"
                        )
                        # Free-text field; the value is parsed by run_dataset_evaluation_ui
                        eval_max_samples = gr.Textbox(
                            label="Max Samples (optional)",
                            placeholder="Leave blank to evaluate on all samples"
                        )
                        eval_button = gr.Button(
                            "‚ñ∂Ô∏è Run Evaluation",
                            variant="primary"
                        )
                        eval_metrics_table = gr.Dataframe(
                            label="Evaluation Metrics",
                            interactive=False,
                            wrap=True
                        )
                        eval_metadata = gr.JSON(
                            label="Evaluation Metadata",
                            value={}
                        )
                        eval_status = gr.Textbox(
                            label="Evaluation Status",
                            lines=3,
                            interactive=False,
                            value="Click 'Run Evaluation' to compute metrics."
                        )

        # Processing status
        # NOTE(review): this box is not listed in the outputs of either click
        # handler below, so it keeps showing its initial text.
        status_output = gr.Textbox(
            label="Processing Status",
            lines=3,
            interactive=False,
            value="Ready to process audio..."
        )

        # Connect the processing function; the order of outputs must match the
        # order of the six values returned by process_audio
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, tagged_output, word_table, entities_table, stats_table, stats_summary],
            show_progress=True
        )

        # Evaluation handler returns (metrics table, metadata, status message)
        eval_button.click(
            fn=run_dataset_evaluation_ui,
            inputs=[eval_dataset, eval_max_samples],
            outputs=[eval_metrics_table, eval_metadata, eval_status],
            show_progress=True
        )

    return demo

# Create and launch the interface
print("üöÄ Creating Gradio interface...")
# When this cell is re-run, the app launched by the previous run still owns the
# fixed port used below. Close it first so the new launch can bind to that port.
_previous_demo = globals().get("demo")
if _previous_demo is not None and hasattr(_previous_demo, "close"):
    _previous_demo.close()
demo = create_gradio_ui()
# For Google Colab, use share=True to get a public link
# For local use, set share=False
print("‚úÖ Interface created! Launching...")
demo.launch(share=True, server_name="0.0.0.0", server_port=7866)

üöÄ Creating Gradio interface...
‚úÖ Interface created! Launching...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://588d627b6de008bc0f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


