In [None]:
from dotenv import load_dotenv
import os
import re
from loguru import logger
import boto3
from pathlib import Path
import urllib



load_dotenv("vars.env")

True

In [None]:
def download_file_from_s3(s3_client, bucket_name, s3_key, local_path):
    """Download a single object from S3 to a local file.

    Args:
        s3_client: Client exposing ``download_file(bucket, key, filename)``.
        bucket_name: Name of the bucket that holds the object.
        s3_key: Key of the object to download.
        local_path: Destination path on the local filesystem. Missing parent
            directories are created.
    """
    parent_dir = os.path.dirname(local_path)
    # A bare file name has no parent directory, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    s3_client.download_file(bucket_name, s3_key, local_path)
    logger.info(f"Downloaded {s3_key} to {local_path}")

def download_folder_from_s3(s3_client, bucket_name, s3_key):
    """Download every object stored under an S3 prefix.

    Each object is written to a local path identical to its S3 key, so the
    bucket layout under ``s3_key`` is mirrored relative to the current working
    directory.

    Args:
        s3_client: Client forwarded to ``list_s3_files`` and
            ``download_file_from_s3``.
        bucket_name: Name of the bucket to read from.
        s3_key: Key prefix ("folder") whose objects are downloaded.
    """
    local_parent = os.path.dirname(s3_key)
    # A prefix without any "/" has no parent directory; os.makedirs("") would
    # raise FileNotFoundError.
    if local_parent:
        os.makedirs(local_parent, exist_ok=True)

    downloaded = 0
    for segment in list_s3_files(s3_client, bucket_name, s3_key):
        # Keys ending in "/" are folder placeholders, not files: there is
        # nothing to download and the local path would be a directory.
        if segment.endswith("/"):
            continue
        # download_file_from_s3 already logs each file with its real destination.
        download_file_from_s3(s3_client, bucket_name, segment, segment)
        downloaded += 1
    logger.info(f"Downloaded {downloaded} file(s) from prefix {s3_key}")


def list_s3_files(s3_client, bucket_name, prefix):
    """Return the keys of all objects in ``bucket_name`` whose key starts with ``prefix``.

    Results are gathered across every page of the ``list_objects_v2``
    paginator; pages without a ``"Contents"`` entry contribute nothing.
    """
    result_pages = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket_name, Prefix=prefix
    )
    return [
        entry["Key"]
        for result_page in result_pages
        for entry in result_page.get("Contents", [])
    ]






def clean_and_split(text):
    """Clean a verse and split it into non-empty, stripped fragments.

    Steps: drop a leading run of digits (and the whitespace after it), remove
    the listed symbol/quote characters, split on runs of ``: ; , . ? !`` and
    discard fragments that are empty once stripped.
    """
    cleaned = re.sub(r'^\d+\s*', '', text)
    for unwanted in ("+", "”", "“", "’", "’”", "*"):
        cleaned = cleaned.replace(unwanted, "")
    fragments = (piece.strip() for piece in re.split(r'[:;,.?,!]+', cleaned))
    return [fragment for fragment in fragments if fragment]



def get_audio_paths(folder: str) -> list[str]:
    """Return the ``.mp3`` files directly inside ``folder``, in segment order.

    Files are ordered by the integer following ``segment_`` in their file
    name. Files without such a number are placed last. Ties are broken by file
    name so the result does not depend on directory listing order.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        POSIX-style path strings, one per matching file.
    """
    def segment_sort_key(audio_file: Path) -> tuple[float, str]:
        # Match on the file name only, so a "segment_N" directory component
        # cannot be mistaken for the file's own segment number.
        match = re.search(r"segment_(\d+)", audio_file.name)
        number = int(match.group(1)) if match else float("inf")
        return number, audio_file.name

    audio_files = sorted(Path(folder).glob("*.mp3"), key=segment_sort_key)
    return [audio_file.as_posix() for audio_file in audio_files]


def extract_audio_identifier(url):
    """Extract ``(chapter, page)`` from a URL or path ending in ``<chapter>/page_<n>``.

    Args:
        url: URL or slash-separated path; leading/trailing slashes are ignored.

    Returns:
        Tuple of the percent-decoded second-to-last path component and the
        last component converted to ``int`` after removing ``"page_"``.

    Raises:
        IndexError: If ``url`` has fewer than two path components.
        ValueError: If the last component is not ``page_<integer>``.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import unquote

    parts = url.strip("/").split("/")
    return unquote(parts[-2]), int(parts[-1].replace("page_", ""))


In [3]:
# Clients configuration
# Credentials and endpoint are read from environment variables (presumably
# populated from "vars.env" by the load_dotenv call at the top — confirm).
# os.getenv returns None for a variable that is not set.
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
endpoint_url = os.getenv("AWS_ENDPOINT_URL_S3")

# S3 client shared by the download helpers in this notebook.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    endpoint_url=endpoint_url,
)

In [None]:
# Bucket passed to download_folder_from_s3 by the page-processing loop.
BUCKET_NAME = "moore-collection"
# Top-level key prefix of the segmented audio. NOTE(review): the page paths
# built further down repeat this value as a literal instead of using it.
SOURCE_FOLDER = "fasoai-segmented_audios"

In [189]:
from datasets import load_dataset
import pandas as pd

# Dataset identifier handed to load_dataset.
DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
# Load the "train" split and convert it to a pandas DataFrame.
dataset = load_dataset(DATA_FILE, split="train").to_pandas()
# Add "chapter" and "page" columns parsed from each row's source URL, so rows
# can be matched with the audio folders (which use the same chapter/page_<n> layout).
dataset[["chapter", "page"]] = dataset["moore_source_url"].apply(
    lambda x: pd.Series(extract_audio_identifier(x))
)


In [None]:
def merge_transcriptions_for_audio(transcription_list, merge_start_index, num_transcriptions_to_merge):
    """Merge consecutive transcriptions so they line up with a single audio file.

    The ``num_transcriptions_to_merge`` items starting at ``merge_start_index``
    are joined with a single space and replaced by that one string; every item
    after them therefore shifts down. The input list is not modified.

    Args:
        transcription_list: Sequence of transcription strings.
        merge_start_index: Index of the first transcription to merge
            (``0 <= merge_start_index <= len(transcription_list)``).
        num_transcriptions_to_merge: Number of consecutive items to merge. A
            count that runs past the end merges up to the last item; ``0``
            inserts an empty string at ``merge_start_index``.

    Returns:
        A new list with the merged transcription in place of the originals.

    Raises:
        IndexError: If ``merge_start_index`` is outside the list.
        ValueError: If ``num_transcriptions_to_merge`` is negative.
    """
    # Negative values would be read as "from the end" by slicing and silently
    # drop or duplicate items, so reject them explicitly.
    if not 0 <= merge_start_index <= len(transcription_list):
        raise IndexError(
            f"merge_start_index {merge_start_index} is out of range for "
            f"{len(transcription_list)} transcriptions."
        )
    if num_transcriptions_to_merge < 0:
        raise ValueError("num_transcriptions_to_merge must be non-negative.")

    merge_end_index = merge_start_index + num_transcriptions_to_merge
    merged_transcription = " ".join(transcription_list[merge_start_index:merge_end_index])

    return [
        *transcription_list[:merge_start_index],
        merged_transcription,
        *transcription_list[merge_end_index:],
    ]


def split_trnascription(transcriptions, split_word, index):
    """Split one transcription in two, right after the first occurrence of ``split_word``.

    Args:
        transcriptions: List of transcription strings.
        split_word: Text marking the end of the first half; it stays in the
            left part.
        index: Position in ``transcriptions`` of the element to split.

    Returns:
        A new list where the element at ``index`` is replaced by its two
        halves, each stripped of surrounding whitespace. If ``split_word``
        does not occur in the element, ``transcriptions`` is returned unchanged.

    Raises:
        IndexError: If ``index`` is outside the list.
        ValueError: If ``split_word`` is empty.
    """
    if index < 0 or index >= len(transcriptions):
        raise IndexError("L'index est hors de portée de la liste.")
    # An empty marker "matches" at position 0 and would only insert an empty
    # transcription in front of the element.
    if not split_word:
        raise ValueError("split_word must be a non-empty string.")

    element = transcriptions[index]
    pos = element.find(split_word)

    if pos == -1:
        return transcriptions

    cut = pos + len(split_word)
    # Strip both halves so they look like the fragments produced by
    # clean_and_split (no leading/trailing spaces around the cut).
    left_part = element[:cut].strip()
    right_part = element[cut:].strip()

    return transcriptions[:index] + [left_part, right_part] + transcriptions[index + 1:]


In [None]:
# Run this cell one page at a time: the loop stops (`break`) after the first
# page so its transcriptions can be aligned by hand in the cells below.
from tqdm import tqdm

pages = [f"fasoai-segmented_audios/Sɩngre/page_{i}" for i in range(21, 51)]

for page_folder in tqdm(pages):
    # Download the page's audio segments only if they are not already on disk.
    if os.path.isdir(page_folder):
        print(f"Folder {page_folder} already exists")
    else:
        download_folder_from_s3(s3_client, BUCKET_NAME, page_folder)

    # Parse chapter and page number up front; the loop variable keeps the
    # folder path while `page` holds the page number used by later cells.
    chapter, page = extract_audio_identifier(page_folder)

    paths = get_audio_paths(page_folder)
    # Leading segments have no matching verse text: 3 on page 1, 2 on every
    # other page (presumably title/introduction audio — confirm).
    # The original test `"page_1" in pages` compared against the whole list of
    # folder paths and could never be true.
    if page == 1:
        paths = paths[3:]
    else:
        paths = paths[2:]

    tmp = dataset[(dataset.page == page) & (dataset.chapter == chapter)]
    lists = tmp["moore_verse_text"].apply(clean_and_split).to_list()
    # Flatten the per-verse fragment lists into one list of transcriptions.
    transcriptions = [fragment for verse_fragments in lists for fragment in verse_fragments]
    print(len(transcriptions), len(paths))

    if len(transcriptions) == len(paths):
        data_dict = {
            "audio": paths,
            "transcript": transcriptions
        }
        print(chapter, page)
    break

  0%|          | 0/30 [00:00<?, ?it/s]

Folder fasoai-segmented_audios/Sɩngre/page_21 already exists
103 99





In [198]:

# Alignment: merge two consecutive transcriptions that belong to one audio segment.
# NOTE: this cell rebinds `transcriptions`, so running it twice merges twice;
# re-run the page-processing cell first to start over.
parts= []
# Index chosen by hand for the current page. The trailing values look like
# indices used in earlier runs — presumably for other pages; confirm.
merge_start_index  = 91 #21 #7 #17 1 #28-3
# Show the first transcription about to be merged, as a sanity check.
print(transcriptions[merge_start_index])
transcriptions = merge_transcriptions_for_audio(transcriptions, merge_start_index, 2)
# The two lengths must be equal before the page is saved.
len(transcriptions), len(paths)

Woto


(99, 99)

In [177]:
# Manual fix for the opposite case: one transcription covering two audio
# segments is split right after the given words. Word and index are chosen by
# hand for the page being processed. Rebinds `transcriptions` (not idempotent).
transcriptions = split_trnascription(transcriptions, "mam rũm-kɩɩmbã", 23)
# The two lengths must be equal before the page is saved.
len(transcriptions), len(paths)

(67, 64)

In [None]:
# `json` is not imported anywhere else in the notebook.
import json

# Pair each audio segment path with its aligned transcription.
data = {
    "audio": paths,
    "transcript": transcriptions
}

# open() does not create missing directories.
os.makedirs("output", exist_ok=True)
# ensure_ascii=False keeps the non-ASCII Mooré characters readable in the file.
with open('output/page_21.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)


In [197]:
# Print every transcription next to a segment number for manual checking.
# The offset of 3 presumably accounts for the 2 leading segments skipped in
# the page-processing cell plus 1-based numbering — confirm.
for i , aligned_transcript  in enumerate(transcriptions):
    print(i+3, aligned_transcript)
    # NOTE(review): original note read "#1 SI TRANSCRIPTION +K ? SGEMENT k-1"
    # (French: "if transcription +k ? segment k-1"); meaning unclear — confirm with the author.

3 A Zeova tẽega a Saara yell wa a sẽn da yeelã
4 A Zeova pidsa bũmb ning a sẽn da pʋlem a Saara wɛɛngẽ wã
5 Dẽ a Saara rɩka pʋg n dog bi-ribl n kõ a Abrahaam a kʋʋlem wakate
6 wakat ning Wẽnnaam sẽn da pʋlm-a wã
7 A Abrahaam pʋda a bi-pɛɛlgã a Saara sẽn dog n kõ-a wã ta Izaake
8 A Abrahaam kẽesa a biig a Izaak bãong a sẽn wa n tar rasem a nii
9 wa Wẽnnaam sẽn da yeel-a wã
10 B sẽn wa n na n dog a Abrahaam biig a Izaakã ta Abrahaam tara yʋʋm koabga
11 Dẽ
12 a Saara yeelame yaa
13 Wẽnnaam kɩtame tɩ mam la ne sũ-noogo
14 Ned ning fãa sẽn na n wʋm yãoã na n laa ne maam
15 A paasame
16 Ãnda n da na n yeel a Abrahaam ta ‘Saara na n yẽsma kamba
17 Baa ne rẽ
18 mam doga bi-ribl n kõ-a a kʋʋlem wakate
19 Biigã bɩɩme
20 n bas bĩisim
21 A Abrahaam maana tig-kãseng daar ning a Izaak sẽn bas bĩismã
22 La a Saara ra neeme tɩ Ezɩpt ned a Agaar sẽn dog biig ning n kõ a Abrahaamã ra yaanda a Izaake
23 Rẽ n so ta yeel a Abrahaam yaa
24 Rig yem-poakã ne a biigã
25 bala yem-pog-kãngã biigã pa na n naag ma