In [15]:
import os
import json
import librosa
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
import pickle

In [16]:

# Load the JSON data
with open('tajweed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def load_audio_features(audio_path, max_len=100):
    """Load audio file and extract MFCC features."""
    try:
        y, sr = librosa.load(audio_path, sr=None)  # Load the audio file
        if len(y) == 0:
            raise ValueError("Empty audio file")
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # Extract MFCC features
        if mfccs.shape[1] < max_len:
            padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - mfccs.shape[1])), mode='constant')
        else:
            padded_mfccs = mfccs[:, :max_len]
        return padded_mfccs.T  # Transpose to get (time, feature) shape
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return np.zeros((max_len, 13))

# Prepare input-output pairs
inputs = []
outputs = []

max_len = 100  # Fixed max length for MFCC features

for verse in data:
    if verse.get("surah_number") == 2:
        break
    
    for word_audio in verse['words_audios']:
        audio_path = word_audio['audio_path']
        word = word_audio['word']

        if os.path.exists(audio_path):
            print(f"Loading audio: {audio_path} + {word}")
            mfcc_features = load_audio_features(audio_path, max_len)
            inputs.append(mfcc_features)
            outputs.append(word)
# Pad sequences to the same length
inputs_padded = np.array(inputs)

# Convert words to numerical labels
label_encoder = LabelEncoder()
outputs_encoded = label_encoder.fit_transform(outputs)

# One-hot encode the labels
outputs_one_hot = to_categorical(outputs_encoded)


Loading audio: word_by_word/tajweed/surah_1/001001001.mp3 + بِسۡمِ
Loading audio: word_by_word/tajweed/surah_1/001001002.mp3 + ٱللَّهِ
Loading audio: word_by_word/tajweed/surah_1/001001003.mp3 + ٱلرَّحۡمَٰنِ
Loading audio: word_by_word/tajweed/surah_1/001001004.mp3 + ٱلرَّحِيمِ
Loading audio: word_by_word/tajweed/surah_1/001002001.mp3 + ٱلۡحَمۡدُ
Loading audio: word_by_word/tajweed/surah_1/001002002.mp3 + لِلَّهِ
Loading audio: word_by_word/tajweed/surah_1/001002003.mp3 + رَبِّ
Loading audio: word_by_word/tajweed/surah_1/001002004.mp3 + ٱلۡعَٰلَمِينَ
Loading audio: word_by_word/tajweed/surah_1/001003001.mp3 + ٱلرَّحۡمَٰنِ
Loading audio: word_by_word/tajweed/surah_1/001003002.mp3 + ٱلرَّحِيمِ
Loading audio: word_by_word/tajweed/surah_1/001004001.mp3 + مَٰلِكِ
Loading audio: word_by_word/tajweed/surah_1/001004002.mp3 + يَوۡمِ
Loading audio: word_by_word/tajweed/surah_1/001004003.mp3 + ٱلدِّينِ
Loading audio: word_by_word/tajweed/surah_1/001005001.mp3 + إِيَّاكَ
Loading audio: word_by_wor

In [17]:
# Check the shapes of the inputs
print(f"Shape of inputs_padded: {inputs_padded.shape}")
print(f"Shape of outputs_one_hot: {outputs_one_hot.shape}")


Shape of inputs_padded: (29, 100, 13)
Shape of outputs_one_hot: (29, 26)


In [21]:


# Define the model
model = Sequential([
    # Masking layer for padded inputs
    Masking(mask_value=0.0, input_shape=(max_len, 13)),
    LSTM(128, return_sequences=False),  # LSTM layer
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer
])

model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
callable = [EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True,
                          )]
# Train the model
model.fit(inputs_padded, outputs_one_hot, epochs=50,
          batch_size=16, validation_split=0.2, callbacks=callable
          )

# Save the model and label encoder
model.save('quran_recitation_model.keras')

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 263ms/step - accuracy: 0.0000e+00 - loss: 3.5017 - val_accuracy: 0.0000e+00 - val_loss: 3.4898
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.0290 - loss: 3.2433 - val_accuracy: 0.0000e+00 - val_loss: 3.5719
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.1866 - loss: 3.0513 - val_accuracy: 0.0000e+00 - val_loss: 3.6013
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.2074 - loss: 2.8679 - val_accuracy: 0.0000e+00 - val_loss: 3.6385
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.2364 - loss: 2.7689 - val_accuracy: 0.0000e+00 - val_loss: 3.6654
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.3071 - loss: 2.6280 - val_accuracy: 0.0000e+00 - val_loss: 3.6463
Epoch 7/50
[1m2/

In [22]:

# Load the trained model
model = tf.keras.models.load_model('quran_recitation_model.keras')

# Load the label encoder
with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)
label_encoder

In [24]:
def load_audio_features(audio_path, max_len=100):
    """Load audio file and extract MFCC features."""
    try:
        y, sr = librosa.load(audio_path, sr=None)  # Load the audio file
        if len(y) == 0:
            raise ValueError("Empty audio file")
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # Extract MFCC features
        if mfccs.shape[1] < max_len:
            padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - mfccs.shape[1])), mode='constant')
        else:
            padded_mfccs = mfccs[:, :max_len]
        return padded_mfccs.T  # Transpose to get (time, feature) shape
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return np.zeros((max_len, 13))


In [25]:
def predict_word(audio_path):
    """Predict the word from an audio file."""
    # Load audio features
    mfcc_features = load_audio_features(audio_path)
    # Expand dimensions to match model input
    mfcc_features = np.expand_dims(mfcc_features, axis=0)
    # Predict using the model
    predictions = model.predict(mfcc_features)
    # Get the predicted label
    predicted_label_index = np.argmax(predictions, axis=1)[0]
    predicted_word = label_encoder.inverse_transform([predicted_label_index])[0]
    return predicted_word


In [32]:
# Path to your voice input file
# voice_input_path = 'output_folder/surah_1/ayah_2/1.wav'
voice_input_path = 'word_by_word/tajweed/surah_1/001001001.mp3'

# Make a prediction 
predicted_word = predict_word(voice_input_path)
print(f"Predicted word: {predicted_word}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Predicted word: ٱلَّذِينَ


# Ahay By Ayah Model with Dict

In [36]:
import librosa
import numpy as np
frame_length = 2048
hop_length = 512
def load_audio_features(audio_path, max_len=100):
    """Load audio file and extract MFCC features."""
    try:
        y, sr = librosa.load(audio_path, sr=None)  # Load the audio file
        if len(y) == 0:
            raise ValueError("Empty audio file")
        
        max_volume = np.max(np.abs(y))
        if max_volume > 0:
            audio_normalized = y / max_volume
        else:
            audio_normalized = y
        mfccs = librosa.feature.mfcc(y=audio_normalized, sr=sr, n_mfcc=13,n_fft=frame_length, hop_length=hop_length)  # Extract MFCC features
        # if mfccs.shape[1] < max_len:
        #     padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - mfccs.shape[1])), mode='constant')
        # else:
        #     padded_mfccs = mfccs[:, :max_len]
        return mfccs.T  # Transpose to get (time, feature) shape
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return np.zeros((max_len, 13))


In [37]:
import json
# Load the JSON data
with open('ayahs.json', 'r', encoding='utf-8') as f:
    ayahs = json.load(f)


In [38]:
import os

baseDirectory = 'wav_audio_files'
audio_mfccs = []
for root, dirs, files in os.walk(baseDirectory):
    for file in files:
        if file.endswith(".wav"):
            audio_path = os.path.join(root, file)
            if 'surah_1' == root.split('\\')[-1]:
                mfcc_features = load_audio_features(audio_path)
                audio_mfccs.append({
                    "mfcc_features":  mfcc_features,
                     "surah" : int(root.split('\\')[-1].split('_')[-1]),
                     "ayah" : int(file.split('.')[0][-3:])
                     })
                print(f"add mfcc audio to dict: {audio_path}")

add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001001.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001002.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001003.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001004.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001005.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001006.wav
add mfcc audio to dict: wav_audio_files\alafasy\surah_1\001007.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001001.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001002.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001003.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001004.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001005.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001006.wav
add mfcc audio to dict: wav_audio_files\husary\surah_1\001007.wav


In [39]:
# add ayah text to audio_mfccs
for ayah in ayahs:
    for audio_mfcc in audio_mfccs:
        if ayah['surah_number'] == audio_mfcc['surah'] and ayah['verse_number'] == audio_mfcc['ayah']:
            audio_mfcc['ayah_text'] = ayah['content']
            print(f"add ayah text to audio_mfccs: {ayah['content']}")

add ayah text to audio_mfccs: بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ
add ayah text to audio_mfccs: بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ
add ayah text to audio_mfccs: ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ
add ayah text to audio_mfccs: ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ
add ayah text to audio_mfccs: ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ
add ayah text to audio_mfccs: ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ
add ayah text to audio_mfccs: مَٰلِكِ يَوۡمِ ٱلدِّينِ
add ayah text to audio_mfccs: مَٰلِكِ يَوۡمِ ٱلدِّينِ
add ayah text to audio_mfccs: إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ
add ayah text to audio_mfccs: إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ
add ayah text to audio_mfccs: ٱهۡدِنَا ٱلصِّرَٰطَ ٱلۡمُسۡتَقِيمَ
add ayah text to audio_mfccs: ٱهۡدِنَا ٱلصِّرَٰطَ ٱلۡمُسۡتَقِيمَ
add ayah text to audio_mfccs: صِرَٰطَ ٱلَّذِينَ أَنۡعَمۡتَ عَلَيۡهِمۡ غَيۡرِ ٱلۡمَغۡضُوبِ عَلَيۡهِمۡ وَلَا ٱلضَّآلِّينَ
add ayah text to audio_mfccs: صِرَٰطَ ٱلَّذِينَ أَنۡعَمۡتَ عَلَيۡهِمۡ غَيۡرِ ٱلۡمَغۡضُوبِ عَلَيۡهِمۡ وَلَا ٱلضَّآلِّ

In [41]:
audio_mfccs[0]

{'mfcc_features': array([[-431.21408  ,  116.135376 ,   45.372055 , ...,   -5.017194 ,
          -21.09673  ,  -25.217861 ],
        [-414.73224  ,  119.5349   ,   34.47957  , ...,   -2.7899337,
          -19.247639 ,  -25.726654 ],
        [-426.96448  ,  105.60271  ,   28.291512 , ...,   -7.5280175,
          -19.526176 ,  -21.729227 ],
        ...,
        [-422.82968  ,  119.113106 ,   39.078217 , ...,   -9.655136 ,
          -24.388931 ,  -17.006163 ],
        [-441.36496  ,  109.17972  ,   42.743786 , ...,  -14.56104  ,
          -25.479225 ,  -14.851435 ],
        [-474.82578  ,   86.504745 ,   47.789864 , ...,  -20.854565 ,
          -26.170555 ,  -13.820563 ]], dtype=float32),
 'surah': 1,
 'ayah': 1,
 'ayah_text': 'بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ'}

In [42]:
# Preparing the features and labels
# Calculate the maximum sequence length
max_len = max(item['mfcc_features'].shape[0] for item in audio_mfccs)

# Pad or truncate MFCC features
X = np.array([np.pad(item['mfcc_features'], ((0, max_len - item['mfcc_features'].shape[0]), (0, 0)), mode='constant') 
              if item['mfcc_features'].shape[0] < max_len 
              else item['mfcc_features'][:max_len] 
              for item in audio_mfccs])
y = [item['ayah_text'] for item in audio_mfccs]

# Encoding the text labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_one_hot = to_categorical(y_encoded)

from sklearn.model_selection import train_test_split
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)


In [43]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11, 1136, 13), (3, 1136, 13), (11, 7), (3, 7))

In [46]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


# model = Sequential([
#     # Masking layer for padded inputs
#     Masking(mask_value=0.0, input_shape=(X.shape[1], X.shape[2])),
#     LSTM(128, return_sequences=True),  # LSTM layer
#     Dense(y_one_hot.shape[1], activation='softmax')  # Output layer
# ])
model = Sequential()
# Add LSTM layers
model.add(LSTM(128, return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(16, return_sequences=False))



model.add(Dense(16, activation='relu'))

model.add(Dense(y_one_hot.shape[1], activation='softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()




In [47]:
y_one_hot.shape ,X.shape


((14, 7), (14, 1136, 13))

In [49]:
# Train the model
model.fit(X, y_one_hot, epochs=50, batch_size=16, validation_split=0.2, callbacks=callable)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 624ms/step - accuracy: 0.1818 - loss: 1.7372 - val_accuracy: 0.0000e+00 - val_loss: 2.4192
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681ms/step - accuracy: 0.1818 - loss: 1.7244 - val_accuracy: 0.0000e+00 - val_loss: 2.4195
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 640ms/step - accuracy: 0.3636 - loss: 1.7125 - val_accuracy: 0.0000e+00 - val_loss: 2.4186
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 632ms/step - accuracy: 0.3636 - loss: 1.7020 - val_accuracy: 0.0000e+00 - val_loss: 2.4168
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 621ms/step - accuracy: 0.3636 - loss: 1.6918 - val_accuracy: 0.0000e+00 - val_loss: 2.4139
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 620ms/step - accuracy: 0.3636 - loss: 1.6824 - val_accuracy: 0.0000e+00 - val_loss: 2.4092
Epoch 7/50
[1m1