In [59]:
import dotenv
import IPython.display as ipd
import os
import re
import torchaudio

from datasets import Dataset, Audio

# Audio files need to be named in the following format (any audio extension, e.g. .mp3 or .wav):
# [book]_[start_chapter]_[start_verse]-[end_chapter]_[end_verse].[ext]
#
# os.listdir() returns entries in arbitrary order, so sort the names to make
# sure audio_files[0] is the lowest chapter range on every run. Hidden files
# (names starting with '.') are skipped so they cannot end up at index 0.
audio_files = sorted(
    name for name in os.listdir('audio/John') if not name.startswith('.')
)
print(audio_files)

  from .autonotebook import tqdm as notebook_tqdm


['JHN_001_001-001_051.mp3', 'JHN_002_001-002_025.mp3']


In [13]:
# Parse the book code and verse range out of the first audio file name,
# e.g. 'JHN_001_001-001_051.mp3' -> ('JHN', '001', '001', '001', '051')
match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})-(\d{3})_(\d{3})', audio_files[0])
if match is None:
    raise ValueError(
        f"Audio file name {audio_files[0]!r} does not match "
        "[book]_[start_chapter]_[start_verse]-[end_chapter]_[end_verse]"
    )
book, start_ch, start_vs, end_ch, end_vs = match.groups()

# load Chichewa John 1 and 2 from text file
# (explicit encoding so the result does not depend on the platform default)
with open("nya_jhn1_2.txt", "r", encoding="utf-8") as file:
    nya_jhn_lines = file.readlines()

# load vref from text file
with open("vref.txt", "r", encoding="utf-8") as file:
    vref_lines = file.readlines()

# Find the vref line for the first verse of the audio file (e.g. "JHN 1:1").
# int() strips the zero padding used in the file name.
start_ref = f"{book} {int(start_ch)}:{int(start_vs)}"
start_index = next(
    (index for index, line in enumerate(vref_lines) if line.strip() == start_ref),
    None,
)
if start_index is None:
    raise ValueError(f"Reference {start_ref!r} not found in vref.txt")
end_index = start_index + 3  # preview window: the first three verses

# view first three lines of the passage
# NOTE(review): both lists are indexed with the same i, which assumes
# nya_jhn1_2.txt is line-aligned with vref.txt — confirm for new text files.
for i in range(start_index, end_index):
    print(vref_lines[i], nya_jhn_lines[i])


JHN 1:1
 Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu. 

JHN 1:2
 Mawuwa anali ndi Mulungu pachiyambi.

JHN 1:3
 Zinthu zonse zinalengedwa ndi Iye; ndipo popanda Iye sikukanakhala kanthu kalikonse kolengedwa. 



In [None]:
# Load the first audio file (first chapter of John) and play it back inline.
# normalize=False plays the samples as loaded, without IPython rescaling them.
first_audio_path = 'audio/John/' + audio_files[0]
aud, sr = torchaudio.load(first_audio_path)
first_chapter_player = ipd.Audio(aud, rate=sr, normalize=False)
ipd.display(first_chapter_player)

In [None]:
# Optional: drop whatever precedes verse 1 in the recording.
# Everything before the 9-second mark is cut; the offset is converted from
# seconds to a sample index using the sample rate returned by torchaudio.load.
trim_start_sample = int(sr*9)
aud2 = aud[:, trim_start_sample:]
# Uncomment to overwrite the source file with the trimmed audio:
#torchaudio.save('audio/John/' + audio_files[0], aud2, sr)
trimmed_player = ipd.Audio(aud2, rate=sr, normalize=False)
ipd.display(trimmed_player)

In [40]:
# Prerequisite: name the audio files and prepare the text file and ref file as
# shown above, then upload them to S3. With the files in S3, the code below
# runs the forced alignment of text and audio remotely.

import modal

# Look up the "ForcedAligner" class of the "forced-alignment" Modal app and
# create an instance whose methods are invoked remotely.
fa_class = modal.Cls.lookup("forced-alignment", "ForcedAligner")
fa = fa_class()
fa.load_model.remote("facebook/wav2vec2-base-960h")  # Use orthographic model

# S3 locations of the alignment inputs
alignment_inputs = dict(
    s3_audio_key="input_data/audio/Chichewa_John_1-2",  # folder in S3 bucket where audio files are stored
    s3_text_key="input_data/text/nya_jhn1_2.txt",  # text file with Chichewa text
    s3_ref_key="input_data/text/vref.txt",  # text file with references
)
fa_results_df = fa.align.remote(**alignment_inputs)

# The result is a dataframe with timestamps for splitting source audio
fa_results_df

Unnamed: 0,filename,text,source_file,start,end
0,output/JHN_1_1.wav,"Pachiyambi panali Mawu, ndipo Mawu anali kwa M...",JHN_001_001-001_051.mp3,1.180,8.180
1,output/JHN_1_2.wav,Mawuwa anali ndi Mulungu pachiyambi.,JHN_001_001-001_051.mp3,9.540,12.300
2,output/JHN_1_3.wav,Zinthu zonse zinalengedwa ndi Iye; ndipo popan...,JHN_001_001-001_051.mp3,13.380,22.440
3,output/JHN_1_4.wav,"Mwa Iye munali moyo, ndipo moyowo unali kuwuni...",JHN_001_001-001_051.mp3,23.240,29.601
4,output/JHN_1_5.wav,"Kuwunika kunawala mu mdima, koma mdimawo sunak...",JHN_001_001-001_051.mp3,31.120,37.561
...,...,...,...,...,...
71,output/JHN_2_21.wav,Koma Nyumba ya Mulungu imene amanena linali th...,JHN_002_001-002_025.mp3,258.416,264.897
72,output/JHN_2_22.wav,"Iye ataukitsidwa kwa akufa, ophunzira ake anak...",JHN_002_001-002_025.mp3,264.977,277.658
73,output/JHN_2_23.wav,Tsopano Iye ali mu Yerusalemu pa phwando la Pa...,JHN_002_001-002_025.mp3,277.718,290.059
74,output/JHN_2_24.wav,"Koma Yesu sanawakhulupirire iwo, pakuti amadzi...",JHN_002_001-002_025.mp3,290.119,296.919


In [44]:
# Split audio files into segments based on timestamps and save results

TARGET_SAMPLE_RATE = 16000  # every verse clip is written at this sample rate

# Process one source recording at a time so each file is loaded and resampled
# exactly once. groupby(sort=False) keeps the source files in order of first
# appearance and preserves the row order within each file.
for source_file, verse_rows in fa_results_df.groupby('source_file', sort=False):
    aud, sr = torchaudio.load('audio/John/' + source_file)
    if sr != TARGET_SAMPLE_RATE:
        aud = torchaudio.functional.resample(aud, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE)
        sr = TARGET_SAMPLE_RATE

    for filename, start, end in zip(
        verse_rows['filename'], verse_rows['start'], verse_rows['end']
    ):
        # Make sure the destination folder (e.g. 'output/') exists before saving.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # start/end are in seconds; convert to sample indices at the current rate.
        sub_aud = aud[:, int(start*sr):int(end*sr)]
        torchaudio.save(filename, sub_aud, sr)
        #ipd.display(ipd.Audio(sub_aud, rate=sr))

In [57]:
# Spot-check the alignment: print the text of one verse and play its clip

test_verse = 'output/JHN_2_2.wav'
aud, sr = torchaudio.load(test_verse)

# Select the text of the row whose output filename is the verse under test
verse_mask = fa_results_df['filename'].eq(test_verse)
verse_text = fa_results_df.loc[verse_mask, 'text'].reset_index(drop=True).loc[0]
print(verse_text)

verse_player = ipd.Audio(aud, rate=sr)
ipd.display(verse_player)

ndipo Yesu ndi ophunzira ake anayitanidwanso ku ukwatiwo. 


In [None]:
# Optional: publish the aligned verse clips as a private Hugging Face dataset

hf_dataset = "sil-ai/nya_jhn1_2"
dotenv.load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")  # token is taken from the environment, never hard-coded

# Build the dataset from the alignment table; the "filename" column is cast to
# an Audio feature at 16 kHz and then renamed to "audio".
ds = (
    Dataset.from_pandas(fa_results_df)
    .cast_column("filename", Audio(sampling_rate=16_000))
    .rename_column("filename", "audio")
)

# Inspect one record before uploading
sample = ds[1]
sample_audio = sample["audio"]
print(sample["text"])
print(sample_audio["sampling_rate"], len(sample_audio["array"]))

ds.push_to_hub(hf_dataset, private=True, token=HF_TOKEN)