In [7]:
import boto3
import dotenv
import IPython.display as ipd
import os
import pandas as pd
import re
import torchaudio

from datasets import Dataset, Audio

audio_files = os.listdir('audio/John')
print(audio_files)

['JHN_001_001-001_051.mp3', 'JHN_002_001-002_025.mp3']


In [2]:
start_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})', audio_files[0])
start_book, start_ch, start_vs = start_match.groups()
end_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})-(\d{3})_(\d{3})', audio_files[-1])
end_book, _, _, end_ch, end_vs = end_match.groups()
print("Start: ", start_book, start_ch, start_vs)
print("End: ", end_book, end_ch, end_vs)

Start:  JHN 001 001
End:  JHN 002 025


In [3]:
# load Chichewa John 1 and 2 from text file
with open("nya_jhn1_2.txt", "r", encoding='utf-8') as file:
    nya_jhn_lines = file.readlines()
    nya_jhn_lines = [l.strip() for l in nya_jhn_lines]

# load vref from text file and find start and end index for John 1:1 to John 1:5
with open("vref.txt", "r") as file:
    vref_lines = file.readlines()
    for index, line in enumerate(vref_lines):
        if line.strip() == f"{start_book} {str(int(start_ch))}:{str(int(start_vs))}":
            start_index = index
        if line.strip() == f"{end_book} {str(int(end_ch))}:{str(int(end_vs))}":
            end_index = index
            break

# view first three lines of John 1
for i in range(start_index, start_index + 3):
    print(vref_lines[i], nya_jhn_lines[i])

vref_lines = vref_lines[start_index:end_index]
nya_jhn_lines = nya_jhn_lines[start_index:end_index]

JHN 1:1
 Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.
JHN 1:2
 Mawuwa anali ndi Mulungu pachiyambi.
JHN 1:3
 Zinthu zonse zinalengedwa ndi Iye; ndipo popanda Iye sikukanakhala kanthu kalikonse kolengedwa.


In [4]:
ref_data = [{'key': vl, 'text': rl} for vl, rl in zip(vref_lines, nya_jhn_lines)]
ref_data_items = [[verse for verse in ref_data if verse['key'].startswith('JHN 1')], 
                  [verse for verse in ref_data if verse['key'].startswith('JHN 2')]]
ref_data_items[0][0]

{'key': 'JHN 1:1\n',
 'text': 'Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.'}

In [5]:
def get_s3_object_urls(bucket_name, folder_prefix):
  s3_client = boto3.client('s3')
  object_urls = []
  response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
  if 'Contents' in response:
    for obj in response['Contents']:
      object_url = f"https://{bucket_name}.s3.amazonaws.com/{obj['Key']}"
      object_urls.append(object_url.replace(f"https://", "s3://").replace('.s3.amazonaws.com', ''))

  return [url for url in object_urls if url.endswith((".wav", ".mp3", ".m4a"))]

bucket = 'forcedalignment'
folder = 'input_data/audio/Chichewa_John_1-2/'

s3paths = get_s3_object_urls(bucket, folder)
s3paths

['s3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_001_001-001_051.mp3',
 's3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_002_001-002_025.mp3']

In [13]:
audio_files = [{
    #"filename": s3paths[i].split("/")[-1],
    "s3_path": s3paths[i],
    #"ref_text": [{'text': item['text']} for item in ref_data_items[i]]
    "ref_text": ref_data_items[i]
} for i in range(len(s3paths))]
audio_files

[{'s3_path': 's3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_001_001-001_051.mp3',
  'ref_text': [{'key': 'JHN 1:1\n',
    'text': 'Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.'},
   {'key': 'JHN 1:2\n', 'text': 'Mawuwa anali ndi Mulungu pachiyambi.'},
   {'key': 'JHN 1:3\n',
    'text': 'Zinthu zonse zinalengedwa ndi Iye; ndipo popanda Iye sikukanakhala kanthu kalikonse kolengedwa.'},
   {'key': 'JHN 1:4\n',
    'text': 'Mwa Iye munali moyo, ndipo moyowo unali kuwunika kwa anthu.'},
   {'key': 'JHN 1:5\n',
    'text': 'Kuwunika kunawala mu mdima, koma mdimawo sunakuzindikire.'},
   {'key': 'JHN 1:6\n',
    'text': 'Kunabwera munthu amene anatumizidwa kuchokera kwa Mulungu; Iyeyo dzina lake linali Yohane.'},
   {'key': 'JHN 1:7\n',
    'text': 'Iye anabwera ngati mboni kudzachitira umboni kuwunikako kuti kudzera mwa iye anthu onse akhulupirire.'},
   {'key': 'JHN 1:8\n',
    'text': 'Iyeyu sanali kuwunika; koma anabwera ngati mboni ya kuw

In [16]:
audio_files[1]['s3_path']

's3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_002_001-002_025.mp3'

In [17]:
import modal
import pandas as pd

fa_class = modal.Cls.lookup("general-forced-alignment", "ForcedAligner")
fa = fa_class()
fa_results = fa.align_from_s3.remote(
            audio_files=audio_files,
            romanize=False,
        )
fa_results_df = pd.DataFrame.from_dict(fa_results)
fa_results_df

C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py:88: DeprecationError: 2025-01-27: `modal.Cls.lookup` is deprecated and will be removed in a future release. It can be replaced with `modal.Cls.from_name`.

See https://modal.com/docs/guide/modal-1-0-migration for more information.
  self._context.run(self._callback, *self._args)


Unnamed: 0,filename,text,source_file,start,end,asr_transcription,match_score
0,output/JHN_1_1.wav,"Pachiyambi panali Mawu, ndipo Mawu anali kwa M...",JHN_001_001-001_051,1.180,8.180,patiani panardima dipomau anadikamunu dipoma n...,52
1,output/JHN_1_2.wav,Mawuwa anali ndi Mulungu pachiyambi.,JHN_001_001-001_051,9.540,12.300,maowa annidimuugu patiadi,62
2,output/JHN_1_3.wav,Zinthu zonse zinalengedwa ndi Iye; ndipo popan...,JHN_001_001-001_051,13.380,22.440,zitizosi zinanigedua nde dipopopanai sukunaga ...,61
3,output/JHN_1_4.wav,"Mwa Iye munali moyo, ndipo moyowo unali kuwuni...",JHN_001_001-001_051,23.240,29.601,mai unanimou dipomoyao kunadi kuni ca kantu,67
4,output/JHN_1_5.wav,"Kuwunika kunawala mu mdima, koma mdimawo sunak...",JHN_001_001-001_051,31.120,37.561,kunica unaana mum dima komam dima sunkuzindigi,78
...,...,...,...,...,...,...,...
70,output/JHN_2_20.wav,"Ayuda anayankha kuti, “Zinatenga zaka makumi a...",JHN_002_001-002_025,238.355,256.696,anu ba amunugui dipoine ti zamangaso masiatan ...,66
71,output/JHN_2_21.wav,Koma Nyumba ya Mulungu imene amanena linali th...,JHN_002_001-002_025,258.176,262.717,komanu ba mu imin amanina linri tibiraki,61
72,output/JHN_2_22.wav,"Iye ataukitsidwa kwa akufa, ophunzira ake anak...",JHN_002_001-002_025,264.557,276.098,e anogizitua kakufa kopusirake anakubugiras mi...,66
73,output/JHN_2_23.wav,Tsopano Iye ali mu Yerusalemu pa phwando la Pa...,JHN_002_001-002_025,277.518,288.938,sopanoi rimerusarim papanolapasc anpabiri anon...,74


In [21]:
# Split audio files into segments based on timestamps and save results

source_file = fa_results_df['source_file'][0]
aud, sr = torchaudio.load('audio/John/' + source_file + '.mp3')

for index, row in fa_results_df.iterrows():
    if row['source_file'] != source_file:
        source_file = row['source_file']
        aud, sr = torchaudio.load('audio/John/' + source_file + '.mp3')
    filename = row['filename']
    if sr != 16000:
        aud = torchaudio.functional.resample(aud, orig_freq=sr, new_freq=16000)
        sr = 16000
    start = row['start']
    end = row['end']
    sub_aud = aud[:, int(start*sr):int(end*sr)]
    torchaudio.save(filename, sub_aud, sr)
    #ipd.display(ipd.Audio(sub_aud, rate=sr))

In [None]:
# Check that the alignment is correct

test_verse = 'output/JHN_2_2.wav'
aud, sr = torchaudio.load(test_verse)
print(fa_results_df[fa_results_df['filename'] == test_verse].reset_index(
    drop=True).loc[0]['text'])
ipd.display(ipd.Audio(aud, rate=sr))

In [None]:
# Optionally upload results to Hugging Face

hf_dataset = "sil-ai/nya_jhn1_2"
dotenv.load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

ds = Dataset.from_pandas(fa_results_df) 
ds = ds.cast_column("filename", Audio(sampling_rate=16_000))
ds = ds.rename_column("filename", "audio")
sample = ds[1]
print(sample["text"])
print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]))
ds.push_to_hub(hf_dataset, private=True, token=HF_TOKEN)