In [None]:
from dotenv import load_dotenv
import os
import re
import urllib
from loguru import logger
import pandas as pd
from pathlib import Path


import pandas as pd
from pydub import AudioSegment
import openpyxl
from tqdm import tqdm
from datasets import Dataset, Audio, Features, Value, concatenate_datasets

load_dotenv("vars.env")

True

In [None]:
def extract_audio_identifier(url):
    parts = url.strip("/").split("/")
    return urllib.parse.unquote(parts[-2]), int(parts[-1].replace("page_",""))

def remove_digits_and_numbers(text):
  """
  Removes digits and numbers from the beginning of a string and keeps the remaining text.
  """
  match = re.match(r'^[\d\s]+', text)
  if match:
    return text[match.end():].strip()
  else:
    return text
  

def time_to_milliseconds(time_str):
    """Converts time string (MM:SS or HH:MM:SS) to milliseconds."""
    try:
        parts = time_str.split(":")
        if len(parts) == 2:  # MM:SS format
            minutes, seconds = map(int, parts)
            return (minutes * 60 + seconds) * 1000
        elif len(parts) == 3:  # HH:MM:SS format
            minutes, seconds, milli_second = map(int, parts)
            return (minutes * 60 + seconds  + milli_second/10000) * 1000
        else:
            print(f"Error: Invalid time format '{time_str}'")
            return 0  # Return 0 milliseconds if invalid format
    except ValueError as e:
        print(f"Error parsing time string '{time_str}': {e}")
        return 0  
    


def get_audio_paths(folder: str) -> list[str]:
    def extract_number(file_path: str) -> int:
        match = re.search(r"segment_(\d+)", file_path)
        return int(match.group(1)) if match else float("inf")
    audio_paths = list(Path(folder).glob("*.mp3"))
    audio_paths = list(Path(folder).glob("*.mp3"))
    audio_paths = [audio_path.as_posix() for audio_path in audio_paths]
    audio_paths = sorted(audio_paths, key=extract_number)
    return audio_paths


In [195]:

def segment_audio_from_excel(chapter:str, excel_file:str, audio_file:str, dataset, output_folder="segments"):
    """
    Segments an audio file based on timestamps from an Excel file with multiple sheets.
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    results= []

    logger.info("read audios")

    wb = openpyxl.load_workbook(excel_file)
    audio = AudioSegment.from_file(audio_file)
  
    for sheet_name in tqdm(wb.sheetnames):
        logger.info(f"start processing sheet {sheet_name}")
        df = pd.read_excel(excel_file, sheet_name=sheet_name)
        os.makedirs(f"{output_folder}/{sheet_name}", exist_ok=True)
        for index, row in df.iterrows():
            start_time = row["debut partie"]
            end_time = row["fin partie"]
            start_ms = time_to_milliseconds(str(start_time))
            end_ms = time_to_milliseconds(str(end_time))
            segment = audio[start_ms:end_ms]
            filename = f"{output_folder}/{sheet_name}/segment_{index + 1}.mp3"
            segment.export(filename, format="mp3")
            print(f"Segment saved: {filename}")
        page_id = int(sheet_name.replace("page_", ""))
        segments = get_audio_paths(f"{output_folder}/{sheet_name}")
        sub_transcription_df = dataset[(dataset.page==page_id) & (dataset.chapter==chapter) ]
        transcriptions = get_matches(df, sub_transcription_df)
        audio_sequence = list(range(1, len(transcriptions) + 1))

        results.append({
            "audio":segments,
            "transcript":transcriptions,
            "page":[page_id]*len(segments),
            "audio_sequence":audio_sequence
        })
    return results



def get_verse_id(verse_number, base_id="v1041"):
    return f"{base_id}{verse_number:03d}"

def get_matches(segment_data, transcription_df):
    base_id = transcription_df.verse_id.iloc[0]
    
    base_id = base_id[:-4] if len(base_id)==9 else base_id[:-3]
    try:
        segment_data = segment_data[["debut verset", "fin verset"]].copy()  # Use .copy() to avoid SettingWithCopyWarning
        segment_data.loc[:, "fin verset"] = segment_data["fin verset"].apply(lambda x: get_verse_id(x, base_id))
        segment_data.loc[:, 'debut verset'] = segment_data['debut verset'].apply(lambda x: get_verse_id(x, base_id)) #add this line
    except Exception as e:
        raise e

    segment_data.loc[:, 'debut_num'] = segment_data['debut verset'].str[1:].astype(int)
    segment_data.loc[:, 'fin_num'] = segment_data['fin verset'].str[1:].astype(int)

    transcription_df.loc[:, 'verse_num'] = transcription_df['verse_id'].str[1:].astype(int)

    cross_df = segment_data.assign(key=1).merge(transcription_df.assign(key=1), on='key').drop('key', axis=1)

    filtered_df = cross_df[
        (cross_df['verse_num'] >= cross_df['debut_num']) &
        (cross_df['verse_num'] <= cross_df['fin_num'])
    ]

    result_df = filtered_df.groupby(
        ['debut verset', 'fin verset']
    ).agg({
        'moore_verse_text': lambda x: ' '.join(x)
    }).reset_index()

    return result_df["moore_verse_text"].to_list()

In [None]:
excel_file = "./Sɩngre.xlsx"
audio_file = "./nwt_01_Ge_MM_41.mp3"


DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
dataset = load_dataset(DATA_FILE, split="train").to_pandas()
dataset[["chapter", "page"]] = dataset["moore_source_url"].apply(
    lambda x: pd.Series(extract_audio_identifier(x))
)
dataset["moore_verse_text"] = dataset["moore_verse_text"].apply(remove_digits_and_numbers)



chapter= "Sɩngre"
results = segment_audio_from_excel(chapter, excel_file, audio_file, dataset, output_folder="segments")


dataset_list = []


for result in tqdm(results):
    dataset = Dataset.from_dict(result)
    features = Features({
            "audio": Audio(sampling_rate=48000),
            "transcript": Value("string"),
            "page": Value("string"),
            "audio_sequence": Value("string")
        })
    dataset = dataset.cast(features)

    durations  = []
    for example in dataset:
        audio_path = example["audio"]['path']
        audio = AudioSegment.from_file(audio_path)
        duration = len(audio) / 1000.0  # Convertir
        durations.append(duration)
    dataset = dataset.add_column("duration", durations)
    dataset_list.append(dataset)
datasets = concatenate_datasets(dataset_list)

[32m2025-03-21 23:28:39.552[0m | [1mINFO    [0m | [36m__main__[0m:[36msegment_audio_from_excel[0m:[36m17[0m - [1mread audios[0m
  0%|          | 0/2 [00:00<?, ?it/s][32m2025-03-21 23:28:43.287[0m | [1mINFO    [0m | [36m__main__[0m:[36msegment_audio_from_excel[0m:[36m23[0m - [1mstart processing sheet page_41[0m


Segment saved: segments/page_41/segment_1.mp3
Segment saved: segments/page_41/segment_2.mp3
Segment saved: segments/page_41/segment_3.mp3
Segment saved: segments/page_41/segment_4.mp3
Segment saved: segments/page_41/segment_5.mp3
Segment saved: segments/page_41/segment_6.mp3
Segment saved: segments/page_41/segment_7.mp3
Segment saved: segments/page_41/segment_8.mp3
Segment saved: segments/page_41/segment_9.mp3
Segment saved: segments/page_41/segment_10.mp3
Segment saved: segments/page_41/segment_11.mp3
Segment saved: segments/page_41/segment_12.mp3
Segment saved: segments/page_41/segment_13.mp3
Segment saved: segments/page_41/segment_14.mp3
Segment saved: segments/page_41/segment_15.mp3
Segment saved: segments/page_41/segment_16.mp3
Segment saved: segments/page_41/segment_17.mp3
Segment saved: segments/page_41/segment_18.mp3
Segment saved: segments/page_41/segment_19.mp3
Segment saved: segments/page_41/segment_20.mp3
Segment saved: segments/page_41/segment_21.mp3
Segment saved: segment

 'v1041018' 'v1041021' 'v1041023' 'v1041025' 'v1041027' 'v1041029'
 'v1041030' 'v1041032' 'v1041034' 'v1041036' 'v1041038' 'v1041040'
 'v1041042' 'v1041043' 'v1041045' 'v1041047' 'v1041049' 'v1041051'
 'v1041053' 'v1041055' 'v1041057']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  segment_data.loc[:, "fin verset"] = segment_data["fin verset"].apply(lambda x: get_verse_id(x, base_id))
 'v1041017' 'v1041019' 'v1041022' 'v1041024' 'v1041026' 'v1041028'
 'v1041029' 'v1041031' 'v1041033' 'v1041035' 'v1041037' 'v1041039'
 'v1041041' 'v1041043' 'v1041044' 'v1041046' 'v1041048' 'v1041050'
 'v1041052' 'v1041054' 'v1041056']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  segment_data.loc[:, 'debut verset'] = segment_data['debut verset'].apply(lambda x: get_verse_id(x, base_id)) #add this line
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

Segment saved: segments/page_41/segment_27.mp3
Segment saved: segments/page_42/segment_1.mp3
Segment saved: segments/page_42/segment_2.mp3
Segment saved: segments/page_42/segment_3.mp3
Segment saved: segments/page_42/segment_4.mp3
Segment saved: segments/page_42/segment_5.mp3
Segment saved: segments/page_42/segment_6.mp3
Segment saved: segments/page_42/segment_7.mp3
Segment saved: segments/page_42/segment_8.mp3
Segment saved: segments/page_42/segment_9.mp3
Segment saved: segments/page_42/segment_10.mp3
Segment saved: segments/page_42/segment_11.mp3
Segment saved: segments/page_42/segment_12.mp3
Segment saved: segments/page_42/segment_13.mp3
Segment saved: segments/page_42/segment_14.mp3
Segment saved: segments/page_42/segment_15.mp3


 'v1042018' 'v1042021' 'v1042023' 'v1042025' 'v1042028' 'v1042032'
 'v1042034' 'v1042036' 'v1042037' 'v1042038']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  segment_data.loc[:, "fin verset"] = segment_data["fin verset"].apply(lambda x: get_verse_id(x, base_id))
 'v1042017' 'v1042019' 'v1042022' 'v1042024' 'v1042026' 'v1042029'
 'v1042033' 'v1042035' 'v1042037' 'v1042038']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  segment_data.loc[:, 'debut verset'] = segment_data['debut verset'].apply(lambda x: get_verse_id(x, base_id)) #add this line
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_df.loc[:, 'verse_num'] = transcription_df['verse_id'].str[1:].astype(int)
100%|███████

Segment saved: segments/page_42/segment_16.mp3


Casting the dataset: 100%|██████████| 27/27 [00:00<00:00, 574.47 examples/s]
Casting the dataset: 100%|██████████| 16/16 [00:00<00:00, 1594.60 examples/s]
100%|██████████| 2/2 [00:14<00:00,  7.18s/it]


In [194]:
datasets.save_to_disk(f"contribution_dataset_{chapter}")

Saving the dataset (1/1 shards): 100%|██████████| 43/43 [00:00<00:00, 494.25 examples/s]
