In [None]:
import json
# Load the ayah records from disk. Later cells iterate over `data` and read the
# 'surah_number', 'verse_number' and 'words' keys of every record.
with open('ayahs.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# Preview only the first few records: echoing the whole corpus floods the cell output.
data[:3]

In [None]:
from pydub import AudioSegment
import os

def convert_mp3_to_wav(mp3_folder, wav_folder):
    """Convert every MP3 file found directly inside ``mp3_folder`` to WAV.

    Each ``<name>.mp3`` (extension matched case-insensitively) is written to
    ``wav_folder`` as ``<name>.wav``. The destination folder is created when it
    does not exist yet. A file that cannot be converted is reported on stdout
    and skipped, so one bad file does not stop the batch.

    Args:
        mp3_folder: Folder that is scanned for MP3 files (not recursive).
        wav_folder: Folder that receives the WAV files.
    """
    # Sorted so the processing order does not depend on the file system.
    entries = sorted(os.listdir(mp3_folder))
    # Exporting into a missing folder fails, so make sure it is there.
    os.makedirs(wav_folder, exist_ok=True)

    for file_name in entries:
        # splitext only touches the real extension; str.replace would also
        # rewrite a '.mp3' that appears in the middle of the name.
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != '.mp3':
            continue

        mp3_path = os.path.join(mp3_folder, file_name)
        wav_path = os.path.join(wav_folder, stem + '.wav')
        print(wav_path)
        try:
            # Load MP3 and export as WAV
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(wav_path, format='wav')
        except Exception as e:
            # Best effort: report the failure and keep going with the next file.
            print(f'Failed to convert {mp3_path} to WAV: {e}')


In [None]:
# Load timings JSON data (update this with the actual path or code to get the timings data).
# The entries are later paired positionally with `words` (zip), so the file is
# assumed to hold one timing entry per word in the same order — TODO confirm.
with open('timings.json', 'r', encoding='utf-8') as timingsfile:
    timings = json.load(timingsfile)

# Flatten the ayah records into one entry per word. "last" marks the final word
# of its ayah; the splitting step uses it to cut until the end of the recording.
words = []
for item in data:
    ayah_words = item.get('words') or []  # tolerate records without a word list
    last_index = len(ayah_words) - 1
    for wordIndex, word in enumerate(ayah_words):
        words.append({
            "word_number": wordIndex + 1,
            "word": word,
            "last": wordIndex == last_index,
            "surah_number": item.get('surah_number'),
            "verse_number": item.get('verse_number'),
        })

# Show a short summary instead of dumping every word entry into the output.
print(f"Collected {len(words)} words; first entries: {words[:5]}")
# Split each ayah recording into per-word audio clips using the word timings loaded above

def segmented_ayah_audio_to_word(input_dir, surah, reciter,
                                 timing_key='alafasy_timing', stop_at_surah=2):
    """Cut each ayah recording in ``input_dir`` into one WAV clip per word.

    Ayah files are matched by name: a word belongs to the file called
    ``SSSAAA.wav`` (zero-padded surah and verse number). Every entry of the
    module-level ``words`` list is paired with the ``timings`` entry at the
    same index. A clip starts at the word's own timing and ends at the timing
    of the next entry; the last word of an ayah runs to the end of the
    recording. Clips are written to
    ``word_by_word/<reciter>/<surah>/ayah_<verse>/<word_number>.wav``.

    Problems with a single word are printed and do not stop the run.

    Args:
        input_dir: Folder containing the ayah WAV files.
        surah: Surah folder name used in the output path.
        reciter: Reciter folder name used in the output path.
        timing_key: Key of the timing value (seconds) in each ``timings`` entry.
        stop_at_surah: Scanning of ``words`` stops at the first word whose
            surah number equals this value, so with the default only words
            listed before surah 2 are processed. ``None`` scans every word.
    """
    output_dir = f'word_by_word/{reciter}/{surah}'
    # Sorted so the processing order does not depend on the file system.
    ayah_files = sorted(os.listdir(input_dir))
    if ayah_files:
        os.makedirs(output_dir, exist_ok=True)

    for ayah_path in ayah_files:
        file_name = os.path.basename(ayah_path)
        source_path = f'{input_dir}/{ayah_path}'.replace('\\', '/')
        audio = None  # decoded on first use, then reused for every word of the ayah

        # 114  30 juz 60 hizb 240 rb3 6263 ayha 77426 word
        for index, (item, timing_entry) in enumerate(zip(words, timings)):
            try:
                surah_number = item.get('surah_number')
                ayah_number = item.get('verse_number')
                word_number = item.get('word_number')

                if stop_at_surah is not None and surah_number == stop_at_surah:
                    break
                if not surah_number or not ayah_number:
                    print(f"Skipping item due to missing data: {item}, {timing_entry}")
                    continue
                # Cheap name check first: most words belong to another file.
                if file_name != f'{surah_number:03}{ayah_number:03}.wav':
                    continue

                raw_timing = timing_entry.get(timing_key)
                if raw_timing is None:
                    print(f"Skipping item due to missing data: {item}, {timing_entry}")
                    continue
                timing = float(raw_timing)

                if audio is None:
                    print(f"Processing {source_path}")
                    audio = AudioSegment.from_file(source_path)

                # Create the directory for this ayah
                ayah_dir = f"{output_dir}/ayah_{ayah_number}"
                os.makedirs(ayah_dir, exist_ok=True)

                # Calculate start and end times in milliseconds
                start_time = timing * 1000
                has_next_entry = index + 1 < len(timings)
                if item.get('last') or not has_next_entry:
                    # The last word runs to the end of the recording, so the
                    # following entry is not needed (and may not exist).
                    end_time = None
                    segment = audio[start_time:]
                else:
                    # A following entry without this key yields a 0.1 s clip.
                    next_timing = float(timings[index + 1].get(timing_key, timing + 0.1))
                    end_time = start_time + (next_timing - timing) * 1000
                    segment = audio[start_time:end_time]

                # Export segment
                segment.export(os.path.join(ayah_dir, f"{word_number}.wav"), format="wav")

                end_label = "end of file" if end_time is None else f"{end_time}ms"
                print(f"Exported {word_number} to {ayah_dir} from {start_time}ms to {end_label}")

            except Exception as e:
                # Best effort: report the problem and continue with the next word.
                print(f"An error occurred: {e}")

In [None]:
def prepare_segmented_audio_files(base_folder='downloaded_audio_files',
                                  segmented_audio_folder='segmented_audio_files'):
    """Convert all downloaded recitations to WAV and split them into word clips.

    ``base_folder`` is walked as ``<reciter>/<surah>/``. For every surah folder
    the MP3 files are converted into the mirrored folder below
    ``segmented_audio_folder`` and the resulting ayah WAV files are then cut
    into per-word clips. Entries that are not directories (for example stray
    files left by the operating system) are ignored at both levels.

    Args:
        base_folder: Root folder holding the downloaded MP3 files.
        segmented_audio_folder: Root folder that receives the ayah WAV files.
    """
    os.makedirs(segmented_audio_folder, exist_ok=True)

    for reciter in sorted(os.listdir(base_folder)):
        base_reciter_folder = os.path.join(base_folder, reciter)
        if not os.path.isdir(base_reciter_folder):
            continue  # not a reciter folder
        os.makedirs(os.path.join(segmented_audio_folder, reciter), exist_ok=True)

        for surah in sorted(os.listdir(base_reciter_folder)):
            base_surah_folder = os.path.join(base_reciter_folder, surah)
            if not os.path.isdir(base_surah_folder):
                continue  # not a surah folder
            segmented_surah_folder = os.path.join(segmented_audio_folder, reciter, surah)
            os.makedirs(segmented_surah_folder, exist_ok=True)
            # Convert MP3 to WAV, then split the WAV files word by word.
            convert_mp3_to_wav(base_surah_folder, segmented_surah_folder)
            segmented_ayah_audio_to_word(segmented_surah_folder, surah, reciter)

# Run the whole pipeline: MP3 -> WAV conversion followed by per-word splitting
# for every reciter/surah folder under 'downloaded_audio_files'.
prepare_segmented_audio_files()


In [None]:
# Attach the exported per-word audio clips to every ayah record and save the result.
reciter_name = ["alafasy","husary"]
reciter = reciter_name[0]  # only the first reciter is processed here
for item in data:
    # Only records listed before surah 2 are processed.
    if item.get("surah_number") == 2:
        break
    # Define paths. The values are pulled out first because reusing double
    # quotes inside an f-string is a syntax error before Python 3.12.
    surah_number = item.get("surah_number")
    verse_number = item.get("verse_number")
    input_ayah_audios_path = f"word_by_word/{reciter}/surah_{surah_number}/ayah_{verse_number}"
    print(f"ayah audio: ayah_{verse_number}")

    # Kept in its own name: rebinding the global `words` would break the
    # splitting function, which reads that list.
    ayah_words = item.get("words") or []
    audioList = []

    if not os.path.isdir(input_ayah_audios_path):
        print(f"No word audio found in {input_ayah_audios_path}")
    else:
        # Clips are named '<word_number>.wav'. Pair each clip with its word by
        # that number; a plain string sort would put '10.wav' before '2.wav'.
        numbered_files = []
        for filename in os.listdir(input_ayah_audios_path):
            file_path = f"{input_ayah_audios_path}/{filename}"
            stem = os.path.splitext(filename)[0]
            if stem.isdecimal() and os.path.isfile(file_path):
                numbered_files.append((int(stem), file_path))

        for word_number, file_path in sorted(numbered_files):
            if not 1 <= word_number <= len(ayah_words):
                print(f"Skipping {file_path}: no word number {word_number} in this ayah")
                continue
            word = ayah_words[word_number - 1]
            # Process the file
            print(f"Processing file: {file_path} {word}")
            audioList.append({
                "word_number": word_number,
                "word": word,
                "audio_path": file_path,
            })
    item["words_audios"] = audioList
# Save the updated JSON data to a file (optional)
with open(f'{reciter}_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# Print the updated data to verify
# print(json.dumps(data, ensure_ascii=False, indent=4))