In [None]:
import os
import json
import librosa
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
import pickle
# Read the verse/word metadata (parsed JSON) used to build the dataset below
with open('alafasy_data.json', 'r', encoding='utf-8') as metadata_file:
    data = json.load(metadata_file)

def load_audio_features(audio_path, max_len=100, n_mfcc=13):
    """Load an audio file and return a fixed-size MFCC feature matrix.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        max_len: Number of MFCC frames to keep. Shorter clips are
            zero-padded at the end; longer clips are truncated.
        n_mfcc: Number of MFCC coefficients computed per frame. Defaults
            to 13, the value previously hard-coded here.

    Returns:
        ``np.ndarray`` of shape ``(max_len, n_mfcc)`` (time, feature).
        If loading or feature extraction fails for any reason (including
        an empty file), the error is printed and an all-zero array of the
        same shape is returned instead of raising (best-effort).
    """
    try:
        # sr=None keeps the file's native sample rate (no resampling).
        y, sr = librosa.load(audio_path, sr=None)
        if len(y) == 0:
            raise ValueError("Empty audio file")
        # librosa returns the coefficients as (n_mfcc, frames).
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        n_frames = mfccs.shape[1]
        if n_frames < max_len:
            # Pad only the time axis, at the end, with zeros.
            padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - n_frames)), mode='constant')
        else:
            padded_mfccs = mfccs[:, :max_len]
        return padded_mfccs.T  # Transpose to get (time, feature) shape
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back a
        # placeholder whose width matches the requested n_mfcc.
        print(f"Error loading {audio_path}: {e}")
        return np.zeros((max_len, n_mfcc))

# Prepare input-output pairs: one MFCC matrix per word clip, labelled with the word
inputs = []
outputs = []

max_len = 100  # Fixed max length for MFCC features

missing_files = 0  # metadata entries whose audio file is not on disk
failed_loads = 0   # files that exist but yielded no usable features

for verse in data:
    for word_audio in verse['words_audios']:
        audio_path = word_audio['audio_path']
        word = word_audio['word']

        if not os.path.exists(audio_path):
            missing_files += 1
            continue

        print(f"Loading audio: {audio_path} + {word}")
        mfcc_features = load_audio_features(audio_path, max_len)

        # load_audio_features reports failure by returning an all-zero matrix.
        # Training on such a placeholder would attach a real label to an
        # empty input, so drop it instead of adding it to the dataset.
        if not np.any(mfcc_features):
            failed_loads += 1
            continue

        inputs.append(mfcc_features)
        outputs.append(word)

print(f"Loaded {len(inputs)} samples ({missing_files} missing files, {failed_loads} failed loads skipped)")

# Fail early with a clear message instead of an obscure error further down.
if not inputs:
    raise ValueError("No audio samples could be loaded; check the audio paths in alafasy_data.json")

# Every matrix already has max_len rows, so stacking gives one
# (samples, time, feature) array; no further padding happens here.
inputs_padded = np.array(inputs)

# Convert words to numerical labels
label_encoder = LabelEncoder()
outputs_encoded = label_encoder.fit_transform(outputs)

# One-hot encode the labels
outputs_one_hot = to_categorical(outputs_encoded)


In [None]:


# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# ordering are repeatable across "Restart & Run All".
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Keras takes the validation_split fraction from the END of the arrays,
# before any shuffling. The samples were appended verse by verse, so without
# this permutation the validation set would consist only of the last verses.
shuffle_idx = np.random.default_rng(SEED).permutation(len(inputs_padded))
train_inputs = inputs_padded[shuffle_idx]
train_targets = outputs_one_hot[shuffle_idx]

# Define the model
model = Sequential([
    # Zero-padded frames are masked so the LSTM skips them. The input shape
    # (time, feature) is read from the data rather than hard-coded.
    Masking(mask_value=0.0, input_shape=inputs_padded.shape[1:]),
    LSTM(128, return_sequences=False),  # Only the final LSTM state is used
    Dense(len(label_encoder.classes_), activation='softmax')  # One output per known word
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_inputs, train_targets, epochs=50, batch_size=16, validation_split=0.2)

# Save the model and label encoder
model.save('quran_recitation_model.keras')

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [None]:

# Restore the trained network from disk
model = tf.keras.models.load_model('quran_recitation_model.keras')

# Restore the fitted label encoder that maps class indices back to words.
# NOTE: pickle.load can execute arbitrary code; only open files you created.
with open('label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)
label_encoder

In [None]:
# NOTE(review): load_audio_features is also defined earlier in this notebook;
# this later definition replaces it once this cell runs. Keep the two in sync
# or remove one of them.
def load_audio_features(audio_path, max_len=100, n_mfcc=13):
    """Load an audio file and return a fixed-size MFCC feature matrix.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        max_len: Number of MFCC frames to keep. Shorter clips are
            zero-padded at the end; longer clips are truncated.
        n_mfcc: Number of MFCC coefficients computed per frame. Defaults
            to 13, the value previously hard-coded here.

    Returns:
        ``np.ndarray`` of shape ``(max_len, n_mfcc)`` (time, feature).
        If loading or feature extraction fails for any reason (including
        an empty file), the error is printed and an all-zero array of the
        same shape is returned instead of raising (best-effort).
    """
    try:
        # sr=None keeps the file's native sample rate (no resampling).
        y, sr = librosa.load(audio_path, sr=None)
        if len(y) == 0:
            raise ValueError("Empty audio file")
        # librosa returns the coefficients as (n_mfcc, frames).
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        n_frames = mfccs.shape[1]
        if n_frames < max_len:
            # Pad only the time axis, at the end, with zeros.
            padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - n_frames)), mode='constant')
        else:
            padded_mfccs = mfccs[:, :max_len]
        return padded_mfccs.T  # Transpose to get (time, feature) shape
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back a
        # placeholder whose width matches the requested n_mfcc.
        print(f"Error loading {audio_path}: {e}")
        return np.zeros((max_len, n_mfcc))


In [None]:
def predict_word(audio_path):
    """Predict the word spoken in an audio file.

    Uses the notebook-level ``model``, ``label_encoder`` and
    ``load_audio_features``.

    Args:
        audio_path: Path of the audio clip to classify.

    Returns:
        The label, as stored in ``label_encoder``, with the highest
        predicted probability.

    Raises:
        ValueError: If no usable features could be extracted from
            ``audio_path`` (``load_audio_features`` returned its all-zero
            fallback), since a prediction from that placeholder would be
            meaningless.
    """
    # Match the sequence length the loaded model was built with instead of
    # relying on load_audio_features' default happening to agree with it.
    expected_len = model.input_shape[1]
    if expected_len is None:
        mfcc_features = load_audio_features(audio_path)
    else:
        mfcc_features = load_audio_features(audio_path, max_len=expected_len)

    # An all-zero matrix is the loader's failure signal (missing/corrupt file).
    if not np.any(mfcc_features):
        raise ValueError(f"Could not extract audio features from {audio_path}")

    # Add a leading batch axis: (time, feature) -> (1, time, feature)
    batch = np.expand_dims(mfcc_features, axis=0)
    predictions = model.predict(batch)

    # Highest-probability class index, mapped back to its word
    predicted_label_index = np.argmax(predictions, axis=1)[0]
    predicted_word = label_encoder.inverse_transform([predicted_label_index])[0]
    return predicted_word


In [None]:
# Audio clip to classify. An alternative path that can be swapped in:
# voice_input_path = 'output_folder/surah_1/ayah_2/1.wav'
voice_input_path = 'recorded_audio.wav'

# Run predict_word on the clip and print the word it returns
predicted_word = predict_word(voice_input_path)
print(f"Predicted word: {predicted_word}")
