In [None]:
import dotenv
import IPython.display as ipd
import os
import pandas as pd
import re
import torchaudio

from datasets import Dataset, Audio

audio_files = os.listdir('audio/John')
print(audio_files)
# Audio files need to be in the following format: 
# [book]_[start_chapter]_[start_verse]-[end_chapter]_[end_verse].wav

['JHN_001_001-001_051.mp3', 'JHN_002_001-002_025.mp3']


In [2]:
start_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})', audio_files[0])
start_book, start_ch, start_vs = start_match.groups()
end_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})-(\d{3})_(\d{3})', audio_files[-1])
end_book, _, _, end_ch, end_vs = end_match.groups()
print("Start: ", start_book, start_ch, start_vs)
print("End: ", end_book, end_ch, end_vs)

Start:  JHN 001 001
End:  JHN 002 025


In [3]:
# load Chichewa John 1 and 2 from text file
with open("nya_jhn1_2.txt", "r", encoding='utf-8') as file:
    nya_jhn_lines = file.readlines()
    nya_jhn_lines = [l.strip() for l in nya_jhn_lines]

# load vref from text file and find start and end index for John 1:1 to John 1:5
with open("vref.txt", "r") as file:
    vref_lines = file.readlines()
    for index, line in enumerate(vref_lines):
        if line.strip() == f"{start_book} {str(int(start_ch))}:{str(int(start_vs))}":
            start_index = index
        if line.strip() == f"{end_book} {str(int(end_ch))}:{str(int(end_vs))}":
            end_index = index
            break

# view first three lines of John 1
for i in range(start_index, start_index + 3):
    print(vref_lines[i], nya_jhn_lines[i])

vref_lines = vref_lines[start_index:end_index]
nya_jhn_lines = nya_jhn_lines[start_index:end_index]

JHN 1:1
 Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.
JHN 1:2
 Mawuwa anali ndi Mulungu pachiyambi.
JHN 1:3
 Zinthu zonse zinalengedwa ndi Iye; ndipo popanda Iye sikukanakhala kanthu kalikonse kolengedwa.


In [4]:
ref_data = [{'key': vl, 'text': rl} for vl, rl in zip(vref_lines, nya_jhn_lines)]
ref_data_items = [[verse for verse in ref_data if verse['key'].startswith('JHN 1')], 
                  [verse for verse in ref_data if verse['key'].startswith('JHN 2')]]
ref_data_items[0][0]

{'key': 'JHN 1:1\n',
 'text': 'Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.'}

In [None]:
import boto3

def get_s3_object_urls(bucket_name, folder_prefix):
  s3_client = boto3.client('s3')
  object_urls = []
  response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
  if 'Contents' in response:
    for obj in response['Contents']:
      object_url = f"https://{bucket_name}.s3.amazonaws.com/{obj['Key']}"
      object_urls.append(object_url.replace(f"https://", "s3://").replace('.s3.amazonaws.com', ''))

  return [url for url in object_urls if url.endswith((".wav", ".mp3", ".m4a"))]

bucket = 'forcedalignment'
folder = 'input_data/audio/Chichewa_John_1-2/'

s3paths = get_s3_object_urls(bucket, folder)
s3paths

In [None]:
audio_files = [{
    #"filename": s3paths[i].split("/")[-1],
    "s3_path": s3paths[i],
    #"ref_text": [{'text': item['text']} for item in ref_data_items[i]]
    "ref_text": ref_data_items[i]
} for i in range(len(s3paths))]
audio_files

In [48]:
import modal

fa_class = modal.Cls.lookup("general-forced-alignment", "ForcedAligner")
fa = fa_class()

fa.load_model.remote("facebook/wav2vec2-base-960h")
fa_results = fa.align_from_s3.remote(
        audio_files=audio_files,
        romanize=False,
    )
fa_results_df = pd.DataFrame.from_dict(fa_results)
fa_results_df.head(2)

C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py:88: DeprecationError: 2025-01-27: `modal.Cls.lookup` is deprecated and will be removed in a future release. It can be replaced with `modal.Cls.from_name`.

See https://modal.com/docs/guide/modal-1-0-migration for more information.
  self._context.run(self._callback, *self._args)


Unnamed: 0,filename,text,source_file,start,end,asr_transcription
0,output/JHN_001_001-001_051_0.wav,"Pachiyambi panali Mawu, ndipo Mawu anali kwa M...",JHN_001_001-001_051,1.18,8.18,patiani panardima dipomau anadikamunu dipoma n...
1,output/JHN_001_001-001_051_1.wav,Mawuwa anali ndi Mulungu pachiyambi.,JHN_001_001-001_051,9.54,12.3,maowa annidimuugu patiadi


In [43]:
from fuzzywuzzy import fuzz
fa_results_df.reset_index(drop=True, inplace=True)
fa_results_df['match_score'] = fa_results_df.apply(lambda row: fuzz.ratio(row['text'], row['asr_transcription']), axis=1)
fa_results_df.sort_values(by='match_score', ascending=False).head()

Unnamed: 0,filename,text,source_file,start,end,asr_transcription,match_score
19,output/JHN_001_001-001_051.mp3_19.wav,"Iye sanalephere kuvomereza, koma iye anavomere...",JHN_001_001-001_051.mp3,197.063,205.243,esanadi pere kukumereza komai anapomerza momas...,81
33,output/JHN_001_001-001_051.mp3_33.wav,Ine ndaona ndipo ndikuchitira umboni kuti uyu ...,JHN_001_001-001_051.mp3,356.706,362.646,ine daona dipo nkucitira kumoni kutikuyu diman...,79
4,output/JHN_001_001-001_051.mp3_4.wav,"Kuwunika kunawala mu mdima, koma mdimawo sunak...",JHN_001_001-001_051.mp3,31.12,37.561,kunica unaana mum dima komam dima sunkuzindigi,78
7,output/JHN_001_001-001_051.mp3_7.wav,Iyeyu sanali kuwunika; koma anabwera ngati mbo...,JHN_001_001-001_051.mp3,56.901,62.521,e sanadikuniga koma anamera gatiboni yakunica,77
23,output/JHN_001_001-001_051.mp3_23.wav,Tsopano Afarisi ena amene anatumidwa,JHN_001_001-001_051.mp3,242.684,245.224,topanafarisina amini anarumida,76


In [None]:
# Split audio files into segments based on timestamps and save results

source_file = fa_results_df['source_file'][0]
aud, sr = torchaudio.load('audio/John/' + source_file)

for index, row in fa_results_df.iterrows():
    if row['source_file'] != source_file:
        source_file = row['source_file']
        aud, sr = torchaudio.load('audio/John/' + source_file)
    filename = row['filename']
    if sr != 16000:
        aud = torchaudio.functional.resample(aud, orig_freq=sr, new_freq=16000)
        sr = 16000
    start = row['start']
    end = row['end']
    sub_aud = aud[:, int(start*sr):int(end*sr)]
    torchaudio.save(filename, sub_aud, sr)
    #ipd.display(ipd.Audio(sub_aud, rate=sr))

In [None]:
# Check that the alignment is correct

test_verse = 'output/JHN_2_2.wav'
aud, sr = torchaudio.load(test_verse)
print(fa_results_df[fa_results_df['filename'] == test_verse].reset_index(
    drop=True).loc[0]['text'])
ipd.display(ipd.Audio(aud, rate=sr))

In [None]:
# Optionally upload results to Hugging Face

hf_dataset = "sil-ai/nya_jhn1_2"
dotenv.load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

ds = Dataset.from_pandas(fa_results_df) 
ds = ds.cast_column("filename", Audio(sampling_rate=16_000))
ds = ds.rename_column("filename", "audio")
sample = ds[1]
print(sample["text"])
print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]))
ds.push_to_hub(hf_dataset, private=True, token=HF_TOKEN)