In [26]:

import os
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
import librosa
import soundfile as sf
import pickle
from typing import List, Optional
import matplotlib.pyplot as plt
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')

In [27]:
# Configuration
CONFIG = {
    'SAMPLE_RATE': 16000,
    'MAX_AUDIO_LENGTH': 16000 * 10,  # 10 seconds
    'MAX_TEXT_LENGTH': 100,
    'N_MELS': 80,
    'FIXED_SEQUENCE_LENGTH': 625
}

In [28]:
class SinhalaSTTTester:
    """Enhanced Testing class for Sinhala STT model with debugging"""
    
    def __init__(self):
        self.model = None
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0
        self.interpreter = None
        self.input_details = None
        self.output_details = None
        
        # Create basic vocabulary
        self.create_basic_vocabulary()
    
    def create_basic_vocabulary(self):
        """Create a basic Sinhala vocabulary for testing"""
        print("Creating basic Sinhala vocabulary...")
        
        # Basic Sinhala characters and common words
        sinhala_chars = [
            'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ', 'ඍ', 'ඎ', 'ඏ', 'ඐ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ',
            'ක', 'ඛ', 'ග', 'ඝ', 'ඞ', 'ච', 'ඡ', 'ජ', 'ඣ', 'ඤ', 'ට', 'ඨ', 'ඩ', 'ඪ', 'ණ', 'ත', 'ථ', 'ද', 'ධ', 'න',
            'ප', 'ඵ', 'බ', 'භ', 'ම', 'ය', 'ර', 'ල', 'ව', 'ශ', 'ෂ', 'ස', 'හ', 'ළ', 'ෆ',
            'ා', 'ි', 'ී', 'ු', 'ූ', 'ෘ', 'ෙ', 'ේ', 'ෛ', 'ො', 'ෝ', 'ෞ', 'ං', 'ඃ', '්',
            ' ', '.', ',', '?', '!', '-', ':', ';', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        ]
        
        # Add special tokens
        special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
        vocab = special_tokens + sinhala_chars
        
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.vocab_size = len(vocab)
        
        print(f"✓ Basic vocabulary created with {self.vocab_size} characters")
        print(f"✓ Special tokens: {special_tokens}")
        print(f"✓ SOS token index: {self.char_to_idx['<sos>']}")
        print(f"✓ EOS token index: {self.char_to_idx['<eos>']}")
    

In [29]:
    def load_model(self, model_path: str):
        """Load the trained model with enhanced debugging"""
        try:
            print(f"\n🔧 Loading model from: {model_path}")
            print(f"📁 File exists: {os.path.exists(model_path)}")
            print(f"📊 File size: {os.path.getsize(model_path) / 1024 / 1024:.2f} MB")
            
            if model_path.endswith('.tflite'):
                # Load TFLite model
                print("🚀 Loading TFLite model...")
                self.interpreter = tf.lite.Interpreter(model_path=model_path)
                self.interpreter.allocate_tensors()
                self.input_details = self.interpreter.get_input_details()
                self.output_details = self.interpreter.get_output_details()
                
                print("✓ TFLite model loaded successfully")
                print(f"📊 Input details:")
                for i, detail in enumerate(self.input_details):
                    print(f"   Input {i}: {detail['name']} - Shape: {detail['shape']} - Type: {detail['dtype']}")
                print(f"📊 Output details:")
                for i, detail in enumerate(self.output_details):
                    print(f"   Output {i}: {detail['name']} - Shape: {detail['shape']} - Type: {detail['dtype']}")
                    
            else:
                # Load Keras model
                print("🚀 Loading Keras model...")
                self.model = keras.models.load_model(model_path)
                print("✓ Keras model loaded successfully")
                
                # Print model structure
                print("\n📊 Model Architecture:")
                self.model.summary()
                
                # Check input/output shapes
                print(f"\n📊 Model Input Shape: {self.model.input_shape}")
                print(f"📊 Model Output Shape: {self.model.output_shape}")
                
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            import traceback
            traceback.print_exc()
            self.model = None
            return False
        return True
    
    def load_vocabulary(self, vocab_path: str):
        """Load vocabulary from file with debugging"""
        try:
            print(f"\n🔧 Loading vocabulary from: {vocab_path}")
            print(f"📁 File exists: {os.path.exists(vocab_path)}")
            
            with open(vocab_path, 'rb') as f:
                vocab_data = pickle.load(f)
            
            print(f"📊 Vocabulary file contents: {list(vocab_data.keys())}")
            
            self.char_to_idx = vocab_data['char_to_idx']
            self.idx_to_char = vocab_data['idx_to_char']
            self.vocab_size = vocab_data['vocab_size']
            
            print(f"✓ Vocabulary loaded: {self.vocab_size} characters")
            print(f"✓ Sample characters: {list(self.char_to_idx.keys())[:10]}")
            return True
        except Exception as e:
            print(f"❌ Error loading vocabulary: {e}")
            import traceback
            traceback.print_exc()
            return False
    
    def display_audio_info(self, audio_path: str):
        """Display audio information and waveform"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=None)
            
            print(f"\n📊 Audio Information:")
            print(f"  - File: {os.path.basename(audio_path)}")
            print(f"  - Sample rate: {sr} Hz")
            print(f"  - Duration: {len(audio)/sr:.2f} seconds")
            print(f"  - Samples: {len(audio)}")
            print(f"  - Audio range: [{audio.min():.4f}, {audio.max():.4f}]")
            print(f"  - Audio mean: {audio.mean():.4f}")
            print(f"  - Audio std: {audio.std():.4f}")
            
            # Check for silence
            if np.abs(audio).max() < 0.01:
                print("⚠️  WARNING: Audio seems very quiet, might be silence")
            
            # Display audio player
            print("\n🎵 Audio Player:")
            ipd.display(ipd.Audio(audio_path))
            
            # Plot waveform
            plt.figure(figsize=(12, 4))
            plt.subplot(1, 2, 1)
            time = np.linspace(0, len(audio)/sr, len(audio))
            plt.plot(time, audio)
            plt.title('Waveform')
            plt.xlabel('Time (s)')
            plt.ylabel('Amplitude')
            plt.grid(True)
            
            # Plot spectrogram
            plt.subplot(1, 2, 2)
            D = librosa.stft(audio)
            S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
            librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='hz')
            plt.title('Spectrogram')
            plt.colorbar(format='%+2.0f dB')
            
            plt.tight_layout()
            plt.show()
            
        except Exception as e:
            print(f"❌ Error displaying audio info: {e}")
            import traceback
            traceback.print_exc()
    
    def preprocess_audio(self, audio_path: str) -> np.ndarray:
        """Preprocess audio file to mel spectrogram features with debugging"""
        try:
            print(f"\n🎤 Processing audio: {os.path.basename(audio_path)}")
            
            # Load audio
            audio, sr = librosa.load(audio_path, sr=CONFIG['SAMPLE_RATE'])
            print(f"   - Original sample rate: {sr} Hz")
            print(f"   - Target sample rate: {CONFIG['SAMPLE_RATE']} Hz")
            print(f"   - Original length: {len(audio)} samples ({len(audio)/CONFIG['SAMPLE_RATE']:.2f} seconds)")
            print(f"   - Audio range: [{audio.min():.4f}, {audio.max():.4f}]")
            
            # Check for silence
            if np.abs(audio).max() < 0.01:
                print("⚠️  WARNING: Audio seems very quiet, might be silence")
            
            # Pad or trim audio to fixed length
            if len(audio) > CONFIG['MAX_AUDIO_LENGTH']:
                audio = audio[:CONFIG['MAX_AUDIO_LENGTH']]
                print(f"   - Audio trimmed to {CONFIG['MAX_AUDIO_LENGTH']} samples")
            else:
                audio = np.pad(audio, (0, CONFIG['MAX_AUDIO_LENGTH'] - len(audio)), 'constant')
                print(f"   - Audio padded to {CONFIG['MAX_AUDIO_LENGTH']} samples")
            
            # Extract mel spectrogram
            print(f"   - Extracting mel spectrogram with {CONFIG['N_MELS']} mel bands")
            mel_spec = librosa.feature.melspectrogram(
                y=audio, 
                sr=CONFIG['SAMPLE_RATE'],
                n_mels=CONFIG['N_MELS'],
                n_fft=1024,
                hop_length=256,
                win_length=1024
            )
            
            print(f"   - Mel spectrogram shape before processing: {mel_spec.shape}")
            
            # Convert to log scale and normalize
            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
            print(f"   - Mel spec range after log: [{mel_spec.min():.2f}, {mel_spec.max():.2f}]")
            
            mel_spec = (mel_spec - np.mean(mel_spec)) / (np.std(mel_spec) + 1e-8)
            print(f"   - Mel spec range after normalization: [{mel_spec.min():.2f}, {mel_spec.max():.2f}]")
            
            # Transpose and ensure fixed dimensions
            mel_spec = mel_spec.T  # Shape: (time_steps, n_mels)
            print(f"   - Mel spec shape after transpose: {mel_spec.shape}")
            
            # Fixed sequence length for compatibility
            if mel_spec.shape[0] > CONFIG['FIXED_SEQUENCE_LENGTH']:
                mel_spec = mel_spec[:CONFIG['FIXED_SEQUENCE_LENGTH'], :]
                print(f"   - Mel spec trimmed to {CONFIG['FIXED_SEQUENCE_LENGTH']} time steps")
            else:
                pad_length = CONFIG['FIXED_SEQUENCE_LENGTH'] - mel_spec.shape[0]
                mel_spec = np.pad(mel_spec, ((0, pad_length), (0, 0)), 'constant')
                print(f"   - Mel spec padded to {CONFIG['FIXED_SEQUENCE_LENGTH']} time steps")
            
            print(f"   - Final mel spectrogram shape: {mel_spec.shape}")
            
            # Check for NaN or Inf values
            if np.isnan(mel_spec).any():
                print("⚠️  WARNING: NaN values detected in mel spectrogram")
            if np.isinf(mel_spec).any():
                print("⚠️  WARNING: Inf values detected in mel spectrogram")
            
            # Visualize mel spectrogram
            plt.figure(figsize=(12, 6))
            librosa.display.specshow(mel_spec.T, sr=CONFIG['SAMPLE_RATE'], 
                                   x_axis='time', y_axis='mel', cmap='viridis')
            plt.title('Mel Spectrogram Features')
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            plt.show()
            
            return mel_spec
            
        except Exception as e:
            print(f"❌ Error processing audio: {e}")
            import traceback
            traceback.print_exc()
            return np.zeros((CONFIG['FIXED_SEQUENCE_LENGTH'], CONFIG['N_MELS']))
    
    def sequence_to_text(self, sequence: List[int]) -> str:
        """Convert sequence of indices back to text with debugging"""
        text = ""
        print(f"🔤 Converting sequence to text: {sequence[:20]}...")  # Show first 20 indices
        
        for i, idx in enumerate(sequence):
            if idx in self.idx_to_char:
                char = self.idx_to_char[idx]
                if char in ['<pad>', '<sos>', '<eos>']:
                    if char == '<eos>':
                        print(f"   - Found EOS token at position {i}")
                        break
                    continue
                elif char == '<unk>':
                    text += '?'
                    print(f"   - Found UNK token at position {i}")
                else:
                    text += char
            else:
                print(f"   - Unknown index {idx} at position {i}")
                text += '?'
        
        print(f"   - Final text length: {len(text)}")
        return text
    
    def greedy_decode(self, audio_features: np.ndarray) -> str:
        """Simple greedy decoding with enhanced debugging"""
        if self.model is None and self.interpreter is None:
            print("❌ No model loaded")
            return ""
        
        print("\n📝 Using greedy decoding...")
        
        # Prepare inputs
        audio_input = np.expand_dims(audio_features, axis=0).astype(np.float32)
        print(f"   - Audio input shape: {audio_input.shape}")
        
        if self.interpreter is not None:
            # TFLite inference
            print("🚀 Running TFLite inference...")
            decoder_input = np.zeros((1, CONFIG['MAX_TEXT_LENGTH'] - 1), dtype=np.int32)
            decoder_input[0, 0] = self.char_to_idx['<sos>']
            
            print(f"   - Decoder input shape: {decoder_input.shape}")
            print(f"   - SOS token: {self.char_to_idx['<sos>']}")
            
            # Set inputs
            self.interpreter.set_tensor(self.input_details[0]['index'], audio_input)
            self.interpreter.set_tensor(self.input_details[1]['index'], decoder_input)
            
            # Run inference
            self.interpreter.invoke()
            
            # Get output
            prediction = self.interpreter.get_tensor(self.output_details[0]['index'])
            print(f"   - Prediction shape: {prediction.shape}")
            print(f"   - Prediction range: [{prediction.min():.4f}, {prediction.max():.4f}]")
            
            # Get predicted sequence
            predicted_sequence = np.argmax(prediction[0], axis=-1)
            print(f"   - Predicted sequence: {predicted_sequence[:20]}...")
            
            predicted_text = self.sequence_to_text(predicted_sequence)
            
        else:
            # Keras model inference
            print("🚀 Running Keras model inference...")
            decoder_input = np.zeros((1, CONFIG['MAX_TEXT_LENGTH'] - 1))
            decoder_input[0, 0] = self.char_to_idx['<sos>']
            
            print(f"   - Decoder input shape: {decoder_input.shape}")
            print(f"   - SOS token: {self.char_to_idx['<sos>']}")
            
            predicted_text = ""
            
            for i in range(1, CONFIG['MAX_TEXT_LENGTH'] - 1):
                # Get prediction
                prediction = self.model.predict([audio_input, decoder_input], verbose=0)
                print(f"   - Step {i}: Prediction shape: {prediction.shape}")
                
                # Get next token
                next_token = np.argmax(prediction[0, i-1, :])
                print(f"   - Step {i}: Next token: {next_token} ({self.idx_to_char.get(next_token, 'UNK')})")
                
                # Stop if EOS token
                if next_token == self.char_to_idx['<eos>']:
                    print(f"   - EOS token found at step {i}")
                    break
                
                # Add to decoder input for next prediction
                decoder_input[0, i] = next_token
            
            # Convert to text
            predicted_sequence = decoder_input[0, 1:i+1]  # Skip SOS token
            predicted_text = self.sequence_to_text(predicted_sequence.astype(int))
        
        print(f"   - Final predicted text: '{predicted_text}'")
        return predicted_text
    
    def beam_search_decode(self, audio_features: np.ndarray, beam_width: int = 3) -> str:
        """Beam search decoding for better results with debugging"""
        if self.model is None:
            print("❌ Beam search only supported for Keras models")
            return self.greedy_decode(audio_features)
        
        print(f"\n🔍 Using beam search decoding (beam_width={beam_width})...")
        
        audio_input = np.expand_dims(audio_features, axis=0)
        
        # Initialize beam
        beams = [(0.0, [self.char_to_idx['<sos>']])]
        print(f"   - Initial beam: {beams}")
        
        for step in range(CONFIG['MAX_TEXT_LENGTH'] - 1):
            print(f"   - Step {step}: Processing {len(beams)} beams")
            new_beams = []
            
            for beam_idx, (score, sequence) in enumerate(beams):
                if len(sequence) > 0 and sequence[-1] == self.char_to_idx['<eos>']:
                    new_beams.append((score, sequence))
                    continue
                
                # Prepare decoder input
                decoder_input = np.zeros((1, CONFIG['MAX_TEXT_LENGTH'] - 1))
                for i, token in enumerate(sequence):
                    if i < CONFIG['MAX_TEXT_LENGTH'] - 1:
                        decoder_input[0, i] = token
                
                # Get predictions
                predictions = self.model.predict([audio_input, decoder_input], verbose=0)
                
                # Get probabilities for next token
                if len(sequence) - 1 < predictions.shape[1]:
                    next_token_probs = predictions[0, len(sequence) - 1, :]
                    
                    # Get top beam_width predictions
                    top_indices = np.argsort(next_token_probs)[-beam_width:]
                    
                    for idx in top_indices:
                        prob = next_token_probs[idx]
                        if prob > 1e-10:  # Avoid log(0)
                            new_score = score + np.log(prob)
                            new_sequence = sequence + [idx]
                            new_beams.append((new_score, new_sequence))
            
            if not new_beams:
                print(f"   - No valid beams at step {step}")
                break
            
            # Keep only top beam_width beams
            beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:beam_width]
            
            # Stop if all beams ended
            if all(len(seq) > 0 and seq[-1] == self.char_to_idx['<eos>'] for _, seq in beams):
                print(f"   - All beams ended at step {step}")
                break
        
        # Return best sequence
        if beams:
            best_score, best_sequence = beams[0]
            print(f"   - Best beam score: {best_score}")
            print(f"   - Best sequence: {best_sequence}")
            result = self.sequence_to_text(best_sequence[1:])  # Skip SOS token
            return result
        else:
            print("   - No valid beams found")
            return ""
    
    def test_audio(self, audio_path: str, use_beam_search: bool = True, beam_width: int = 3, 
                   show_audio_info: bool = True) -> str:
        """Test audio file with the model - Enhanced with debugging"""
        if not os.path.exists(audio_path):
            print(f"❌ Audio file not found: {audio_path}")
            return ""
        
        print(f"\n🎯 Testing Sinhala STT Model")
        print("=" * 80)
        
        # Display audio information
        if show_audio_info:
            self.display_audio_info(audio_path)
        
        # Preprocess audio
        audio_features = self.preprocess_audio(audio_path)
        
        if self.model is None and self.interpreter is None:
            print("❌ No model loaded. Cannot perform inference.")
            return ""
        
        # Perform inference
        try:
            if use_beam_search and self.model is not None:
                predicted_text = self.beam_search_decode(audio_features, beam_width)
            else:
                predicted_text = self.greedy_decode(audio_features)
        except Exception as e:
            print(f"❌ Error during inference: {e}")
            import traceback
            traceback.print_exc()
            return ""
        
        # Display results
        print(f"\n🎉 FINAL RESULTS:")
        print("=" * 80)
        print(f"📁 Audio File: {os.path.basename(audio_path)}")
        print(f"🔤 Predicted Text: '{predicted_text}'")
        print(f"📏 Text Length: {len(predicted_text)} characters")
        print(f"📊 Method: {'Beam Search' if use_beam_search and self.model else 'Greedy Decoding'}")
        if use_beam_search and self.model:
            print(f"🔍 Beam Width: {beam_width}")
        print("=" * 80)
        
        # Additional debugging if text is empty
        if not predicted_text.strip():
            print("⚠️  WARNING: Empty transcription result!")
            print("   Possible issues:")
            print("   - Audio file might be silent or too quiet")
            print("   - Model might not be properly trained")
            print("   - Vocabulary mismatch between training and testing")
            print("   - Audio preprocessing might have issues")
            print("   - Model architecture mismatch")
        
        return predicted_text


In [32]:
def main():
    """Enhanced main function with debugging"""
    
    # 🔧 CONFIGURATION - MODIFY THESE PATHS
    MODEL_PATH = "sinhala_stt_model.tflite"  # or "path/to/your/model.tflite"
    # Optional
    AUDIO_PATH = "converted.wav"  # Your test audio file
    
    # 🎛️ SETTINGS
    USE_BEAM_SEARCH = True  # Set to False for faster greedy decoding
    BEAM_WIDTH = 3  # Only used if USE_BEAM_SEARCH is True
    SHOW_AUDIO_INFO = True  # Set to False to skip audio visualization
    
    print("🚀 Initializing Enhanced Sinhala STT Tester...")
    print("=" * 80)
    
    # Initialize tester
    tester = SinhalaSTTTester()
    
    # Check file paths
    print(f"\n📂 Checking file paths:")
    print(f"   - Model path: {MODEL_PATH}")
    print(f"   - Model exists: {os.path.exists(MODEL_PATH) if MODEL_PATH != 'path/to/your/model.h5' else 'Please set MODEL_PATH'}")
    print(f"   - Vocab path: {VOCAB_PATH}")
    print(f"   - Vocab exists: {os.path.exists(VOCAB_PATH) if VOCAB_PATH != 'path/to/your/vocabulary.pkl' else 'Please set VOCAB_PATH'}")
    print(f"   - Audio path: {AUDIO_PATH}")
    print(f"   - Audio exists: {os.path.exists(AUDIO_PATH) if AUDIO_PATH != 'path/to/your/audio.wav' else 'Please set AUDIO_PATH'}")
    
    # Load vocabulary (optional)
    if VOCAB_PATH and os.path.exists(VOCAB_PATH):
        success = tester.load_vocabulary(VOCAB_PATH)
        if not success:
            print("⚠️  Warning: Failed to load vocabulary, using basic vocabulary")
    else:
        print("ℹ️  Using basic vocabulary (vocabulary file not provided)")
    
    # Load model
    if MODEL_PATH and os.path.exists(MODEL_PATH):
        success = tester.load_model(MODEL_PATH)
        if not success:
            print("❌ Failed to load model. Exiting.")
            return
    else:
        print(f"❌ Model file not found: {MODEL_PATH}")
        print("Please update the MODEL_PATH variable with the correct path.")
        return
    
    # Test audio
    if AUDIO_PATH and os.path.exists(AUDIO_PATH):
        result = tester.test_audio(
            audio_path=AUDIO_PATH,
            use_beam_search=USE_BEAM_SEARCH,
            beam_width=BEAM_WIDTH,
            show_audio_info=SHOW_AUDIO_INFO
        )
        
        if result.strip():
            print(f"\n✅ SUCCESS: Transcription obtained!")
            print(f"Final result: '{result}'")
        else:
            print(f"\n❌ ISSUE: Empty transcription result")
            print("Check the debug output above for potential issues")
        
    else:
        print(f"❌ Audio file not found: {AUDIO_PATH}")
        print("Please update the AUDIO_PATH variable with the correct path.")
        return
    
    print("\n🏁 Testing completed!")



In [31]:
def quick_debug():
    """Quick debugging function to test individual components"""
    
    print("🔍 Quick Debug Mode")
    print("=" * 50)
    
    # Test 1: Check TensorFlow installation
    print(f"1. TensorFlow version: {tf.__version__}")
    
    # Test 2: Check audio libraries
    try:
        import librosa
        print(f"2. Librosa version: {librosa.__version__}")
    except ImportError:
        print("2. ❌ Librosa not installed")
    
    # Test 3: Basic vocabulary creation
    tester = SinhalaSTTTester()
    print(f"3. Vocabulary size: {tester.vocab_size}")
    
    # Test 4: Audio file check (if path provided)
    audio_path = "path/to/your/audio.wav"  # Update this
    if os.path.exists(audio_path):
        print(f"4. Audio file found: {audio_path}")
        tester.display_audio_info(audio_path)
    else:
        print(f"4. Audio file not found: {audio_path}")


In [33]:
import os
print("Current directory:", os.getcwd())
print("Files in directory:", os.listdir('.'))

Current directory: c:\Users\User\Desktop\notbook stt mode
Files in directory: ['audio_augmented', 'best_sinhala_stt_model.keras', 'converted.wav', 'converted_audio_new', 'file.ipynb', 'final_txt_data.json', 'nimal_hotel.mp3', 'sinhala_stt_model.h5', 'sinhala_stt_model.keras', 'sinhala_stt_model.tflite', 'sinhala_stt_processor.pkl', 'test.ipynb', 'train.ipynb']


In [6]:
import numpy as np
import librosa
import tensorflow as tf

# === Config ===
MODEL_PATH = "sinhala_stt_model.h5"
AUDIO_PATH = "converted.wav"  # Replace with your .wav file
TARGET_SR = 16000              # Sampling rate used during training
EXPECTED_INPUT_LEN = 99        # Model expects input shape [1, 99]

# === Step 1: Load and Preprocess Audio ===
def preprocess_audio_to_mfcc_1d(audio_path, target_sr=16000, expected_len=99):
    audio, sr = librosa.load(audio_path, sr=target_sr)
    audio = librosa.util.normalize(audio)
    
    # Extract MFCCs (use only 1 coefficient per frame to match [1, 99])
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=1)  # shape: [1, time]
    
    # Pad or trim to expected length
    mfcc = mfcc[0]  # flatten from shape (1, time) → (time,)
    if len(mfcc) < expected_len:
        mfcc = np.pad(mfcc, (0, expected_len - len(mfcc)), mode='constant')
    else:
        mfcc = mfcc[:expected_len]
    
    return mfcc[np.newaxis, :].astype(np.float32)  # shape: [1, 99]

# === Step 2: Load TFLite Model ===
def load_tflite_model(model_path):
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    return interpreter

# === Step 3: Run Inference ===
def run_inference(interpreter, input_data):
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()

    output_data = interpreter.get_tensor(output_details[0]['index'])
    return output_data

# === Step 4: Decode Output (Dummy Mapping) ===
# Replace this with your actual index-to-character mapping
index_to_char = {
    0: 'අ', 1: 'ම', 2: 'ට', 3: ' ', 4: 'න', 5: 'ි', 6: 'ල', 7: 'ය', 8: 'හ', 9: 'ෝ', 10: ''
}

def decode_output(output, mapping):
    output_indices = np.argmax(output, axis=-1)
    return ''.join([mapping.get(i, '') for i in output_indices[0]])

# === RUN ===
mfcc_input = preprocess_audio_to_mfcc_1d(AUDIO_PATH)
interpreter = load_tflite_model(MODEL_PATH)
output = run_inference(interpreter, mfcc_input)
transcription = decode_output(output, index_to_char)

print("🗣️ Transcription:", transcription)


ValueError: Model provided has model identifier '

', should be 'TFL3'


In [35]:
input_details = interpreter.get_input_details()
print("Expected input shape:", input_details[0]['shape'])


Expected input shape: [ 1 99]
