In [58]:

import json
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [59]:
# Configuration
AUDIO_DIR = "audio_augmented"  # Directory containing WAV files
SAMPLE_RATE = 16000
MAX_AUDIO_LENGTH = 5000    # Max audio duration in ms (5 seconds)
N_MFCC = 13
MAX_TEXT_LENGTH = 50 

In [60]:
def load_data(json_path):
    """Load JSON data file"""
    try:
        with open(json_path) as f:
            data = json.load(f)
        print(f"Loaded {len(data)} entries from {json_path}")
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return {}


In [61]:
def extract_mfcc(filename, max_pad_len=100):
    """Extract MFCC features from audio file"""
    filepath = os.path.join(AUDIO_DIR, filename)
    
    try:
        y, sr = librosa.load(filepath, sr=SAMPLE_RATE)
        
        # Ensure audio is 5 seconds (pad/trim)
        max_len = int(SAMPLE_RATE * (MAX_AUDIO_LENGTH / 1000))
        if len(y) > max_len:
            y = y[:max_len]
        else:
            y = np.pad(y, (0, max(0, max_len - len(y))), 'constant')
        
        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
        mfccs = np.transpose(mfccs)  # Shape: (time, n_mfcc)
        
        # Pad/trim to fixed timesteps
        if mfccs.shape[0] > max_pad_len:
            mfccs = mfccs[:max_pad_len, :]
        else:
            pad_width = [(0, max_pad_len - mfccs.shape[0]), (0, 0)]
            mfccs = np.pad(mfccs, pad_width, mode='constant')
        
        return mfccs
    except Exception as e:
        print(f"Error extracting MFCC from {filename}: {e}")
        return None

In [62]:

def create_vocab(data):
    """Create character vocabulary from text data"""
    chars = set()
    for item in data.values():
        chars.update(item['text'])
    
    # Sort characters for consistency
    sorted_chars = sorted(chars)
    char_to_num = {char: idx + 1 for idx, char in enumerate(sorted_chars)}  # 0 reserved for padding
    char_to_num['<BLANK>'] = 0  # CTC blank token
    num_to_char = {idx: char for char, idx in char_to_num.items()}
    
    print(f"Vocabulary size: {len(char_to_num)} characters")
    print(f"Characters: {sorted_chars[:20]}...")  # Show first 20 chars
    
    return char_to_num, num_to_char


In [63]:
def prepare_dataset(data, char_to_num, max_samples=None):
    """Prepare dataset for training"""
    X = []
    y = []
    skipped_files = []
    processed_count = 0

    items = list(data.items())
    if max_samples:
        items = items[:max_samples]

    print(f"Processing {len(items)} audio files...")
    
    for i, (key, item) in enumerate(items):
        if i % 100 == 0:
            print(f"Processed {i}/{len(items)} files...")
            
        filepath = os.path.join(AUDIO_DIR, item['newfn'])
        
        # Skip if audio file doesn't exist
        if not os.path.exists(filepath):
            skipped_files.append((key, f"File not found: {item['newfn']}"))
            continue
            
        # Skip if text is too long
        if len(item['text']) > MAX_TEXT_LENGTH:
            skipped_files.append((key, f"Text too long: {len(item['text'])} > {MAX_TEXT_LENGTH}"))
            continue
            
        try:
            # Extract audio features
            mfcc = extract_mfcc(item['newfn'])
            if mfcc is None:
                skipped_files.append((key, "Failed to extract MFCC"))
                continue
                
            # Encode text
            text_encoded = []
            for char in item['text']:
                if char in char_to_num:
                    text_encoded.append(char_to_num[char])
                else:
                    # Skip unknown characters or replace with blank
                    continue
                    
            if len(text_encoded) == 0:
                skipped_files.append((key, "No valid characters in text"))
                continue
                
            X.append(mfcc)
            y.append(text_encoded)
            processed_count += 1
            
        except Exception as e:
            skipped_files.append((key, f"Processing error: {str(e)}"))
            continue

    if len(X) == 0:
        print("ERROR: No valid samples found!")
        print("Skipped files:")
        for key, reason in skipped_files[:10]:  # Show first 10 errors
            print(f"  {key}: {reason}")
        return np.array([]), np.array([])

    # Convert to numpy arrays
    X = np.array(X)
    
    # Pad text sequences
    y_padded = pad_sequences(y, padding='post', value=0, maxlen=MAX_TEXT_LENGTH)
    
    print(f"Successfully processed {processed_count} files")
    print(f"Skipped {len(skipped_files)} files")
    print(f"Final shapes: X={X.shape}, y={y_padded.shape}")
    
    if len(skipped_files) > 0:
        print("\nFirst few skipped files:")
        for key, reason in skipped_files[:5]:
            print(f"  {key}: {reason}")
    
    return X, y_padded

In [64]:
def build_model(input_dim, output_dim, rnn_units=128):
    """Build CTC model for speech recognition"""
    
    # Input layer
    input_data = layers.Input(shape=(None, input_dim), name='input')
    
    # Convolutional layers for feature extraction
    x = layers.Conv1D(64, 3, activation='relu', padding='same')(input_data)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.MaxPooling1D(2)(x)
    
    x = layers.Conv1D(128, 3, activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    
    # Recurrent layers
    x = layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True, dropout=0.2))(x)
    x = layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True, dropout=0.2))(x)
    
    # Dense layer before output
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    
    # Output layer (characters + blank token)
    output = layers.Dense(output_dim, activation='softmax', name='output')(x)
    
    model = models.Model(inputs=input_data, outputs=output)
    
    # Custom CTC loss function
    def ctc_loss_func(y_true, y_pred):
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
        
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        
        loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
        return loss
    
    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=ctc_loss_func,
        metrics=['accuracy']
    )
    
    return model


In [65]:
def main():
    """Main training function"""
    
    # Ensure sklearn is imported (re-import to be safe)
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        print("Error: sklearn not installed. Run: pip install scikit-learn")
        return
    
    # Check if data file exists
    json_path = "final_txt_data.json"
    if not os.path.exists(json_path):
        print(f"Error: {json_path} not found!")
        return
    
    # Check if audio directory exists
    if not os.path.exists(AUDIO_DIR):
        print(f"Error: Audio directory {AUDIO_DIR} not found!")
        return
    
    print("Loading data...")
    data = load_data(json_path)
    if not data:
        print("No data loaded!")
        return
    
    print("Creating vocabulary...")
    char_to_num, num_to_char = create_vocab(data)
    
    print("Preparing dataset...")
    # Limit samples for testing (remove max_samples=100 for full dataset)
    X, y = prepare_dataset(data, char_to_num, max_samples=100)
    
    if len(X) == 0:
        print("No valid samples found. Please check your data and audio files.")
        return
    
    # Check data shapes
    print(f"Dataset shapes: X={X.shape}, y={y.shape}")
    
    if len(X) < 10:
        print(f"Warning: Only {len(X)} samples found. Need more data for training.")
        return
    
    # Split dataset
    print("Splitting dataset...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    
    # Build model
    input_dim = X.shape[2]  # Number of MFCC features
    output_dim = len(char_to_num)  # Number of unique characters including blank
    
    print(f"Building model with input_dim={input_dim}, output_dim={output_dim}")
    model = build_model(input_dim, output_dim)
    
    # Print model summary
    model.summary()
    
    # Training callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=3,
            min_lr=0.00001
        )
    ]
    
    # Train model
    print("Starting training...")
    try:
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=20,
            batch_size=8,
            callbacks=callbacks,
            verbose=1
        )
        
        # Save model
        print("Saving model...")
        model.save('sinhala_stt_model.h5')
        
        # Convert to TFLite (with fixes for LSTM compatibility)
        print("Converting to TFLite...")
        try:
            converter = tf.lite.TFLiteConverter.from_keras_model(model)
            
            # Fix for LSTM/RNN conversion issues
            converter.optimizations = [tf.lite.Optimize.DEFAULT]
            converter.target_spec.supported_ops = [
                tf.lite.OpsSet.TFLITE_BUILTINS,
                tf.lite.OpsSet.SELECT_TF_OPS  # Allow TensorFlow ops
            ]
            converter._experimental_lower_tensor_list_ops = False
            converter.experimental_enable_resource_variables = True
            
            tflite_model = converter.convert()
            
            with open('sinhala_stt.tflite', 'wb') as f:
                f.write(tflite_model)
            
            print("TFLite conversion successful!")
            
        except Exception as tflite_error:
            print(f"TFLite conversion failed: {tflite_error}")
            print("Saving model in SavedModel format instead...")
            
            # Alternative: Save as SavedModel format
            model.save('sinhala_stt_savedmodel', save_format='tf')
            print("Model saved as SavedModel format (sinhala_stt_savedmodel/)")
        
        # Save vocabulary
        vocab_data = {
            'char_to_num': char_to_num,
            'num_to_char': num_to_char
        }
        with open('vocabulary.json', 'w', encoding='utf-8') as f:
            json.dump(vocab_data, f, ensure_ascii=False, indent=2)
        
        print("Training completed successfully!")
        print("Files saved:")
        print("  - sinhala_stt_model.h5 (Keras model)")
        if os.path.exists('sinhala_stt.tflite'):
            print("  - sinhala_stt.tflite (TensorFlow Lite model)")
        if os.path.exists('sinhala_stt_savedmodel'):
            print("  - sinhala_stt_savedmodel/ (SavedModel format)")
        print("  - vocabulary.json (Character vocabulary)")
        
    except Exception as e:
        print(f"Training failed: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()


Loading data...
Loaded 5364 entries from final_txt_data.json
Creating vocabulary...
Vocabulary size: 87 characters
Characters: [' ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'ං', 'ඃ', 'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ']...
Preparing dataset...
Processing 100 audio files...
Processed 0/100 files...
Successfully processed 19 files
Skipped 81 files
Final shapes: X=(19, 100, 13), y=(19, 50)

First few skipped files:
  1: Text too long: 60 > 50
  2: Text too long: 52 > 50
  3: Text too long: 61 > 50
  4: Text too long: 137 > 50
  5: Text too long: 62 > 50
Dataset shapes: X=(19, 100, 13), y=(19, 50)
Splitting dataset...
Training set: 15 samples
Validation set: 4 samples
Building model with input_dim=13, output_dim=87


Starting training...
Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.0182 - loss: inf - val_accuracy: 0.0200 - val_loss: inf - learning_rate: 0.0010
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0324 - loss: inf - val_accuracy: 0.0200 - val_loss: inf - learning_rate: 0.0010
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.0514 - loss: inf - val_accuracy: 0.0200 - val_loss: inf - learning_rate: 0.0010
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0730 - loss: inf - val_accuracy: 0.0300 - val_loss: inf - learning_rate: 5.0000e-04
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.0723 - loss: inf - val_accuracy: 0.0500 - val_loss: inf - learning_rate: 5.0000e-04
Epoch 6/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - acc



Saving model...
Converting to TFLite...
INFO:tensorflow:Assets written to: C:\Users\User\AppData\Local\Temp\tmpn5b8pbdt\assets


INFO:tensorflow:Assets written to: C:\Users\User\AppData\Local\Temp\tmpn5b8pbdt\assets


Saved artifact at 'C:\Users\User\AppData\Local\Temp\tmpn5b8pbdt'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, None, 13), dtype=tf.float32, name='input')
Output Type:
  TensorSpec(shape=(None, None, 87), dtype=tf.float32, name=None)
Captures:
  2359596581520: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596582864: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596583248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596582480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596582288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596583056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596584784: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596585552: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596585936: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2359596585168: TensorSpec(shape=(), dtype=tf.resource, name=None)
  235959658