In [1]:
import boto3
import dotenv
import IPython.display as ipd
import os
import pandas as pd
import re
import torchaudio

from datasets import Dataset, Audio

audio_files = os.listdir('audio/John')
print(audio_files)

['JHN_002_001-002_025.mp3', 'JHN_001_001-001_051.mp3']


In [2]:
start_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})', audio_files[0])
start_book, start_ch, start_vs = start_match.groups()
end_match = re.search(r'([A-Za-z]+)_(\d{3})_(\d{3})-(\d{3})_(\d{3})', audio_files[-1])
end_book, _, _, end_ch, end_vs = end_match.groups()
print("Start: ", start_book, start_ch, start_vs)
print("End: ", end_book, end_ch, end_vs)

Start:  JHN 002 001
End:  JHN 001 051


In [None]:
# load Chichewa John 1 and 2 from text file
with open("nya_jhn1_2.txt", "r", encoding='utf-8') as file:
    nya_jhn_lines = file.readlines()
    nya_jhn_lines = [l.strip() for l in nya_jhn_lines]

# load vref from text file and find start and end index for John 1:1 to John 1:5
with open("vref.txt", "r") as file:
    vref_lines = file.readlines()
    for index, line in enumerate(vref_lines):
        if line.strip() == f"{start_book} {str(int(start_ch))}:{str(int(start_vs))}":
            start_index = index
        if line.strip() == f"{end_book} {str(int(end_ch))}:{str(int(end_vs))}":
            end_index = index
            break

# view first three lines of John 1
#for i in range(start_index, start_index + 3):
#    print(vref_lines[i], nya_jhn_lines[i])
#
#vref_lines = vref_lines[start_index:end_index]
#nya_jhn_lines = nya_jhn_lines[start_index:end_index]

In [None]:
ref_data = [{'key': vl, 'text': rl} for vl, rl in zip(vref_lines, nya_jhn_lines)]
ref_data_items = [[verse for verse in ref_data if verse['key'].startswith('JHN 1')], 
                  [verse for verse in ref_data if verse['key'].startswith('JHN 2')]]
ref_data_items[0][0]

{'key': 'JHN 1:1\n',
 'text': 'Pachiyambi panali Mawu, ndipo Mawu anali kwa Mulungu, ndipo Mawu ndiye Mulungu.'}

In [4]:
file_path = "nepali_deva5s.txt"
file_paths = [file_path]
ref_data_items = []
for file_path in file_paths:
    with open(file_path, 'r') as file:
        file_data = []
        for line in file:
            key, text = line.split('|')
            file_data.append({'key': key, 'text': text.strip()})
    ref_data_items.append(file_data)
ref_data_items[0][:3]

[{'key': '1a ', 'text': 'नमस्ते'},
 {'key': '1b ', 'text': 'मेरो नाम सम्झना थापा हो'},
 {'key': '1c ', 'text': 'म नेपालमा बस्छु'}]

In [5]:
def get_s3_object_urls(bucket_name, folder_prefix):
  s3_client = boto3.client('s3')
  object_urls = []
  response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
  if 'Contents' in response:
    for obj in response['Contents']:
      object_url = f"https://{bucket_name}.s3.amazonaws.com/{obj['Key']}"
      object_urls.append(object_url.replace(f"https://", "s3://").replace('.s3.amazonaws.com', ''))

  return [url for url in object_urls if url.endswith((".wav", ".mp3", ".m4a"))]

bucket = 'forcedalignment'
folder = 'input_data/audio/Chichewa_John_1-2/'

s3paths = get_s3_object_urls(bucket, folder)
s3paths

['s3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_001_001-001_051.mp3',
 's3://forcedalignment/input_data/audio/Chichewa_John_1-2/JHN_002_001-002_025.mp3']

In [None]:
file_path = "MYK-10-MAT-02-phrases.txt"
#file_path = 'nepali_deva5s.txt'
#file_path = 'nepali5s.txt'
file_paths = [file_path]
ref_data_items = []
for file_path in file_paths:
    with open(file_path, 'r') as file:
        file_data = []
        for line in file:
            key, text = line.split('|')
            file_data.append({'key': key, 'text': text.strip()})
    ref_data_items.append(file_data)
audio_files = [
    {'s3_path': 's3://forcedalignment/Matthieu/B01___02_Matthieu____MYKWBTN2DA.mp3',
    #{'s3_path': 's3://forcedalignment/Matthieunepali_5s_denoised.wav',
    'ref_text': ref_data_items[0]}
]
audio_files

import modal
import pandas as pd

# Get the Modal app (new function-based approach)
align_from_s3_parallel = modal.Function.lookup("general-forced-alignment", 
#align_from_dataset_parallel = modal.Function.lookup("general-forced-alignment",
"align_from_s3_parallel",
#"align_from_dataset_parallel",
environment_name="dev"
)

fa_results = align_from_s3_parallel.remote(
    audio_files=audio_files,
    romanize=True,
    #mms_lang='npi',
    mms_lang='swh',
    batch_size=1,     # Process files in parallel
    output_dataset_name=None  # Don't create HF dataset, just return results
)

#fa_results = align_from_dataset_parallel.remote(
#    dataset_name="sil-ai/senga-nt-asr-inferred",
#    text_column="text",
#    audio_column="audio",
#    max_words=80,
#    romanize=False,
#    mms_lang="swh",
#    output_dataset_name="sil-ai/forced-alignment-API-test",
#    batch_size=10,
#    limit=10,  # Don't process all 1000+ files, just a few for testing
#)

fa_results_df = pd.DataFrame.from_dict(fa_results)
fa_results_df

In [34]:
import modal
import pandas as pd
fa = modal.Function.from_name(
    "general-forced-alignment",
    "align_from_dataset_parallel",
    environment_name='dev',
)
fa.hydrate()
# Prepare the arguments, filtering out None values
align_args = {
            "dataset_name": "sil-ai/senga-nt-asr-inferred",
            "text_column": "text",
            "audio_column": "audio",
            "max_words": 80,
            "romanize": False,
            "mms_lang": "swh",
            "output_dataset_name": "sil-ai/forced-alignment-API-test",
            "limit": 10,  # Don't process all 1000+ files, just a few for testing
}
# Remove None values
align_args = {k: v for k, v in align_args.items() if v is not None}
result = fa.remote(**align_args)
print(type(result))
print(result) 

<class 'list'>
[{'chunk_id': 'train_sample_0_chunk_0', 'text': 'Lomba ndipo upiwenge, Penja ndipo usangenge, khokhosha ndipo kujulikenge kwa iwe. Poti aliyose uyo olomba opokela, aliyose uyo openja osanga, ndipo uyo okhokhosha chijalo chijulikenge. Kasi linjani wa imwe pala mwana wake walomba chibama omupa libwe? Panji pala alomba somba, omupa njoka? Pala imwe baheni, mumanya kupeleka vyawanangwa viweme ku bana bini. Naye usemwe kuchanya, opeleka vyawanangwa viwemwe mozyumbisha ku abo bomulomba, chitilani banyinu umo mupenjela kuti nabo bamuchitilani. Ichi ndicho chiyowoyeka mu lamulo na zinchimi.', 'source_sample_id': 'train_sample_0', 'chunk_index': 0, 'start': 19.368, 'end': 19.468, 'duration': 0.1, 'word_count': 76, 'asr_transcription': '', 'match_score': 0, 'dataset_name': 'sil-ai/senga-nt-asr-inferred', 'split': 'train'}, {'chunk_id': 'train_sample_1_chunk_0', 'text': 'Ndipo nkhawona mungelo kukhilila kufuma kuchanya na chivumbila chikhululu chambula kugota na chicheni chikulu ch

In [None]:
import modal
import pandas as pd

fa_class = modal.Cls.lookup("general-forced-alignment", "ForcedAligner")
fa = fa_class()
fa_results = fa.align_from_s3.remote(
            audio_files=audio_files,
            romanize=False,
        )
fa_results_df = pd.DataFrame.from_dict(fa_results)
fa_results_df

In [21]:
# Split audio files into segments based on timestamps and save results

source_file = fa_results_df['source_file'][0]
aud, sr = torchaudio.load('audio/John/' + source_file + '.mp3')

for index, row in fa_results_df.iterrows():
    if row['source_file'] != source_file:
        source_file = row['source_file']
        aud, sr = torchaudio.load('audio/John/' + source_file + '.mp3')
    filename = row['filename']
    if sr != 16000:
        aud = torchaudio.functional.resample(aud, orig_freq=sr, new_freq=16000)
        sr = 16000
    start = row['start']
    end = row['end']
    sub_aud = aud[:, int(start*sr):int(end*sr)]
    torchaudio.save(filename, sub_aud, sr)
    #ipd.display(ipd.Audio(sub_aud, rate=sr))

In [None]:
# Check that the alignment is correct

test_verse = 'output/JHN_2_2.wav'
aud, sr = torchaudio.load(test_verse)
print(fa_results_df[fa_results_df['filename'] == test_verse].reset_index(
    drop=True).loc[0]['text'])
ipd.display(ipd.Audio(aud, rate=sr))

In [None]:
# Example: Using sister language transcription for better ASR quality
# This is useful when you have audio in a language that's similar to a well-supported language

"""
# Example with sister language transcription (e.g., using French for a French-related language)
import modal
import pandas as pd

# Get the Modal function reference
align_from_s3_parallel = modal.Function.lookup("general-forced-alignment", "align_from_s3_parallel")

# Call with sister language support
fa_results_sister = align_from_s3_parallel.remote(
    audio_files=audio_files,
    romanize=False,
    sister_lang="fra",  # Use French as sister language for better ASR
    batch_size=2,
    output_dataset_name=None
)

fa_results_sister_df = pd.DataFrame.from_dict(fa_results_sister)

# Compare ASR quality between original and sister language transcription
print("Original ASR vs Sister Language ASR:")
for i in range(min(5, len(fa_results_df))):
    print(f"\\nSegment {i}:")
    print(f"Original:  {fa_results_df.iloc[i]['asr_transcription']}")
    print(f"Sister:    {fa_results_sister_df.iloc[i]['asr_transcription']}")
    print(f"Expected:  {fa_results_df.iloc[i]['text']}")
"""

print("Sister language transcription example above - uncomment to use!")


In [None]:
# Optionally upload results to Hugging Face

hf_dataset = "sil-ai/nya_jhn1_2"
dotenv.load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

ds = Dataset.from_pandas(fa_results_df) 
ds = ds.cast_column("filename", Audio(sampling_rate=16_000))
ds = ds.rename_column("filename", "audio")
sample = ds[1]
print(sample["text"])
print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]))
ds.push_to_hub(hf_dataset, private=True, token=HF_TOKEN)