This notebook processes Bambara Bible audio–text pairs extracted from Bible.is for speech and NLP experimentation. 

We load the scraped chapter data, verify verse-level alignment, and prepare the material for downstream modeling and analysis.

**Data Content:**
The extraction includes:

* Bambara Bible audio recordings
* Corresponding aligned text (verse-level)

**Usage Context:**
According to Bible.is terms, the content may be used for non-profit, personal, study, or research purposes. The platform explicitly disallows redistribution of the content, or the creation of external services or proxies replicating their materials.

**Purpose of Extraction:**
The audio-text pairs are used exclusively for research in Bambara Natural Language Processing (NLP), primarily for:

* Text-to-Speech (TTS)
* Automatic Speech Recognition (ASR)
* Audio-Text alignment
* Language modeling & data analysis

**Non-Commercial Intent:**
This material is used solely for private academic experimentation. No content (audio or text) from Bible.is will be redistributed, rehosted, resold, or transformed into a public-facing dataset or service.

**Ethical Notes:**
The research respects the platform’s non-profit usage constraints and avoids redistribution. Only model performance metrics, analysis, or high-level results may be shared publicly without exposing the original media.





In [None]:
import asyncio
import json
import logging
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse

import aiofiles
import aiohttp
from playwright.async_api import async_playwright, Locator, Page, Browser
from tenacity import retry, stop_after_attempt, wait_exponential, wait_random
from tqdm.asyncio import tqdm

In [None]:

# Root logging setup: records at INFO and above, each prefixed with a
# timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the scraper code in the following cells.
logger = logging.getLogger(name=__name__)

@dataclass
class VerseData:
    """Data structure for storing verse information.

    Holds one scraped verse: its identifier and its text.
    """
    # Value of the verse element's `data-verseid` attribute as read from the page.
    verse_id: str
    # Text content of the verse element with surrounding whitespace stripped.
    text: str

@dataclass
class ChapterData:
    """Data structure for storing chapter information.

    The fields mirror the keys of the per-chapter JSON record written by the
    scraper (`audio_path`, `non_drama`, `verses`).

    NOTE(review): nothing visible in this notebook instantiates this class;
    the scraper builds a plain dict with the same keys instead.
    """
    # Local filesystem path of the downloaded chapter audio file.
    audio_path: str
    # True when the page's non-drama audio option was selected before download.
    non_drama: bool
    # Verses of the chapter.
    verses: List[VerseData]

In [None]:
class BibleScraper:
    """Scrape chapter audio and verse text from a Bible.is chapter reader.

    Starting from ``base_url``, the scraper opens the chapter dropdown,
    collects the chapter links, then visits each chapter in a fresh browser
    context to download its audio file and extract its verses.

    Output layout (relative to ``output_dir``):

    * ``bible_supyire/audio/<BOOK>_<CHAPTER>.mp3`` - chapter audio
    * ``bible_supyire/json/<BOOK>_<CHAPTER>.json`` - per-chapter record
    * ``combined_bible_data.json`` - list of all per-chapter records

    Files that already exist are not fetched again, so an interrupted run can
    simply be restarted. Audio and JSON files are written to a temporary
    ``.part`` file first and renamed when complete, so a partially written
    file is never mistaken for a finished one.
    """

    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
    ]

    # Chunk size (bytes) used when streaming an audio response to disk.
    _DOWNLOAD_CHUNK_SIZE = 64 * 1024

    def __init__(self, base_url: str, output_dir: str, max_workers: int = 2, delay_min: float = 5.0, delay_max: float = 12.0):
        """
        Initialize the scraper with configuration parameters.

        Args:
            base_url (str): The base URL to start scraping from
            output_dir (str): Directory to store downloaded files
            max_workers (int): Maximum number of concurrent workers
            delay_min (float): Minimum delay between requests in seconds
            delay_max (float): Maximum delay between requests in seconds

        Raises:
            ValueError: If ``max_workers`` is smaller than 1.
        """
        # A semaphore with zero permits would block every chapter task forever.
        if max_workers < 1:
            raise ValueError("max_workers must be at least 1")

        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.audio_dir = self.output_dir / "bible_supyire" / "audio"
        self.json_dir = self.output_dir / "bible_supyire" / "json"
        self.semaphore = asyncio.Semaphore(max_workers)
        self.delay_min = delay_min
        self.delay_max = delay_max

        # The third path component of the start URL (index 2 after splitting
        # on '/') is used as the language/version code to filter chapter links.
        path_parts = urlparse(base_url).path.split('/')
        self.language_code = (path_parts[2] or None) if len(path_parts) > 2 else None

        self.audio_dir.mkdir(parents=True, exist_ok=True)
        self.json_dir.mkdir(parents=True, exist_ok=True)

    async def _random_delay(self):
        """Sleep for a random duration between ``delay_min`` and ``delay_max`` seconds."""
        delay = random.uniform(self.delay_min, self.delay_max)
        await asyncio.sleep(delay)

    def _get_random_user_agent(self):
        """Return a random entry from ``USER_AGENTS``."""
        return random.choice(self.USER_AGENTS)

    @staticmethod
    async def _close_quietly(*resources) -> None:
        """Close each non-None resource, logging (not raising) close failures.

        Used for cleanup so that one failing ``close()`` neither skips the
        remaining resources nor aborts the surrounding task.
        """
        for resource in resources:
            if resource is None:
                continue
            try:
                await resource.close()
            except Exception as exc:
                logger.warning("Failed to close %s: %s", type(resource).__name__, exc)

    @staticmethod
    async def _write_json_atomic(path: Path, payload) -> None:
        """Serialize ``payload`` as UTF-8 JSON to ``path`` via a temporary file.

        The data is written to ``<name>.part`` and renamed afterwards, so
        ``path`` only ever exists with complete content.
        """
        partial_path = path.with_name(path.name + ".part")
        try:
            # Explicit UTF-8: the text is dumped with ensure_ascii=False.
            async with aiofiles.open(partial_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(payload, ensure_ascii=False, indent=2))
            partial_path.replace(path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            partial_path.unlink(missing_ok=True)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=5) + wait_random(0, 2)
    )
    async def _get_chapter_urls(self, page: Page) -> List[str]:
        """Collect absolute chapter URLs from the chapter dropdown of ``page``.

        Links are filtered by ``language_code`` when one was derived from the
        base URL; without a language code every chapter link is kept, matching
        the filter applied in ``_process_chapter``. Duplicate links are
        dropped while preserving their first-seen order.

        Returns:
            List[str]: Absolute chapter URLs.
        """
        logger.info("Collecting chapter URLs...")

        await page.locator("#chapter-dropdown-button").click()
        # Give the dropdown time to render its chapter links.
        await asyncio.sleep(2)

        chapter_links = await page.locator(".chapter-container a.chapter-box").all()
        urls: List[str] = []
        seen = set()
        for link in chapter_links:
            href = await link.get_attribute("href")
            if not href:
                continue
            if self.language_code and self.language_code not in href:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url not in seen:
                seen.add(absolute_url)
                urls.append(absolute_url)

        logger.info(f"Found {len(urls)} chapter URLs for language code {self.language_code}")
        return urls

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=5) + wait_random(0, 2)
    )
    async def _download_audio(self, url: str, audio_src: str, filename: str) -> str:
        """Download ``audio_src`` into the audio directory as ``filename``.

        Args:
            url (str): Chapter page URL, sent as the ``Referer`` header
            audio_src (str): URL of the audio resource
            filename (str): Target file name inside the audio directory

        Returns:
            str: Path of the downloaded (or already present) audio file.

        Raises:
            RuntimeError: If the server answers with a status other than 200.
        """
        output_path = self.audio_dir / filename

        # Reuse a previous download, but not an empty leftover file.
        if output_path.exists() and output_path.stat().st_size > 0:
            return str(output_path)

        # NOTE(review): "br" is advertised in Accept-Encoding; decoding a
        # brotli-compressed response presumably needs a brotli package to be
        # installed alongside aiohttp - confirm for the target environment.
        headers = {
            "User-Agent": self._get_random_user_agent(),
            "Referer": url,
            "Accept": "audio/webm,audio/ogg,audio/mp4,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "DNT": "1",
        }

        partial_path = output_path.with_name(output_path.name + ".part")
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(audio_src, headers=headers) as response:
                    if response.status != 200:
                        raise RuntimeError(f"Failed to download audio: {response.status}")
                    # Stream to disk instead of holding the whole file in memory.
                    async with aiofiles.open(partial_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(self._DOWNLOAD_CHUNK_SIZE):
                            await f.write(chunk)
            # Only a fully received file is moved to its final name.
            partial_path.replace(output_path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            partial_path.unlink(missing_ok=True)

        return str(output_path)

    async def _process_chapter(self, browser: Browser, url: str) -> Optional[Dict]:
        """Scrape one chapter page: download its audio and extract its verses.

        The chapter record is written to ``<BOOK>_<CHAPTER>.json`` in the JSON
        directory, where BOOK and CHAPTER are the last two '/'-separated parts
        of ``url``.

        Returns:
            Optional[Dict]: The chapter record (``audio_path``, ``non_drama``,
            ``verses``), or None when the chapter was skipped (wrong language
            code, already processed, no audio) or an error occurred.
        """
        if self.language_code and self.language_code not in url:
            logger.info(f"Skipping {url} - not matching our language code {self.language_code}")
            return None

        url_parts = url.split('/')
        chapter_stem = f"{url_parts[-2]}_{url_parts[-1]}"
        json_path = self.json_dir / f"{chapter_stem}.json"

        if json_path.exists():
            logger.info(f"Skipping {url} - already processed")
            return None

        page = None
        context = None

        try:
            context = await browser.new_context(
                user_agent=self._get_random_user_agent(),
                viewport={"width": random.randint(1200, 1600), "height": random.randint(800, 1000)},
                locale="en-US"
            )

            await context.set_extra_http_headers({
                "Accept-Language": "en-US,en;q=0.9",
                "DNT": "1",
            })

            page = await context.new_page()

            await page.goto(url, wait_until="networkidle")

            # Small random scroll before interacting with the page.
            await page.evaluate("""
                window.scrollTo({
                    top: Math.random() * 100,
                    behavior: 'smooth'
                });
            """)

            await self._random_delay()

            audio_player = page.locator("video.audio-player")
            if not await audio_player.count():
                logger.warning(f"No audio player found for {url}")
                return None

            # "_" is treated as a placeholder meaning "no audio".
            audio_src = await audio_player.get_attribute("src")
            if not audio_src or audio_src == "_":
                logger.warning(f"No audio source for {url}")
                return None

            # Switch to the non-drama recording when that option is enabled.
            non_drama = False
            audio_drama_toggle = page.locator(".audio-drama-toggle-container #non-drama-button:not(.disabled)")
            if await audio_drama_toggle.count() > 0:
                await audio_drama_toggle.click()
                await self._random_delay()
                non_drama = True

            # Re-read the source: it is read again after the toggle click.
            audio_player = page.locator("video.audio-player")
            audio_src = await audio_player.get_attribute("src")
            if not audio_src or audio_src == "_":
                logger.warning(f"No audio source for {url}")
                return None

            audio_path = await self._download_audio(url, audio_src, f"{chapter_stem}.mp3")

            await self._random_delay()

            verses = []
            verse_elements = await page.locator(".main-wrapper .chapter span.align-left span[data-verseid]").all()

            for verse_elem in verse_elements:
                verse_id = await verse_elem.get_attribute("data-verseid")
                text = await verse_elem.text_content()
                # text_content() may yield None; store an empty string then.
                verses.append(VerseData(verse_id=verse_id, text=(text or "").strip()))

            data = {
                "audio_path": audio_path,
                "non_drama": non_drama,
                "verses": [{"verse_id": v.verse_id, "text": v.text} for v in verses]
            }

            await self._write_json_atomic(json_path, data)

            return data

        except Exception as e:
            # Best effort per chapter: log with traceback and carry on.
            logger.exception("Error processing %s: %s", url, e)
            return None

        finally:
            await self._close_quietly(page, context)
            await self._random_delay()

    async def scrape(self) -> None:
        """Run the full scrape: collect chapter URLs, process them, combine results.

        Chapters are processed in random order, with at most ``max_workers``
        chapters in flight at a time. The browser is closed even when a step
        fails.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True
            )

            try:
                context = await browser.new_context(
                    user_agent=self._get_random_user_agent(),
                    viewport={"width": random.randint(1200, 1600), "height": random.randint(800, 1000)}
                )
                page = await context.new_page()

                try:
                    await page.goto(self.base_url, wait_until="networkidle")
                    await self._random_delay()  # Wait before doing anything
                    chapter_urls = await self._get_chapter_urls(page)
                    random.shuffle(chapter_urls)
                finally:
                    await page.close()
                    await context.close()

                if not chapter_urls:
                    logger.warning("No chapter URLs found at %s", self.base_url)

                tasks = [
                    self._process_chapter_with_semaphore(browser, url)
                    for url in chapter_urls
                ]
                await tqdm.gather(*tasks, desc="Processing chapters")

                await self._combine_json_files()
            finally:
                await browser.close()

    async def _process_chapter_with_semaphore(self, browser, url: str) -> Optional[Dict]:
        """Run ``_process_chapter`` while holding a slot of the worker semaphore."""
        async with self.semaphore:
            return await self._process_chapter(browser, url)

    async def _combine_json_files(self) -> None:
        """Merge all per-chapter JSON files into ``combined_bible_data.json``.

        Files are read in sorted name order so the combined output is
        reproducible. A file that cannot be parsed is skipped with a warning
        instead of aborting the merge.
        """
        combined_data = []

        for json_file in sorted(self.json_dir.glob("*.json")):
            async with aiofiles.open(json_file, encoding='utf-8') as f:
                content = await f.read()
            try:
                combined_data.append(json.loads(content))
            except json.JSONDecodeError as exc:
                logger.warning("Skipping unreadable chapter file %s: %s", json_file, exc)

        combined_output = self.output_dir / "combined_bible_data.json"
        await self._write_json_atomic(combined_output, combined_data)

In [None]:
async def run_scraper(base_url="https://live.bible.is/bible/SPPTBL/MAT/1", 
                     output_dir="supyire/data", 
                     max_workers=2,
                     delay_min=5.0,
                     delay_max=12.0):
    """
    Build a BibleScraper from the given settings and run it to completion.

    Args:
        base_url (str): Chapter page the scrape starts from
        output_dir (str): Directory that receives the output files
        max_workers (int): Upper bound on concurrently processed chapters
        delay_min (float): Lower bound of the random pause, in seconds
        delay_max (float): Upper bound of the random pause, in seconds
    """
    # Gather the configuration once, then hand it over as keyword arguments.
    settings = {
        "base_url": base_url,
        "output_dir": output_dir,
        "max_workers": max_workers,
        "delay_min": delay_min,
        "delay_max": delay_max,
    }
    await BibleScraper(**settings).scrape()

In [46]:
import asyncio
# Top-level `await` works here because this is a notebook cell; in a plain
# script the coroutine would need to be driven with asyncio.run(run_scraper()).
await run_scraper()

2025-03-02 20:50:38,421 - INFO - Collecting chapter URLs...
2025-03-02 20:50:45,651 - INFO - Found 259 chapter URLs for language code SPPTBL
Processing chapters: 100%|██████████| 259/259 [1:03:02<00:00, 14.60s/it]


In [50]:
# Scratch cell - presumably a quick check that the kernel was still responsive
# after the long scraping run; it has no effect on the pipeline.
print("Hello")

Hello



## **Segmentation / Audio–Text Alignment Description**

For verse-level speech processing, the Bambara audio and text pairs were temporally aligned using the **TimestampAudio CLI** tool from Waha ([https://github.com/waha-team/waha-ai-timestamper-cli](https://github.com/waha-team/waha-ai-timestamper-cli)).
The tool uses Meta’s **MMS ASR** model to generate timestamp boundaries between the audio signal and the corresponding text spans.

### **Purpose**

The timestamping step provides aligned segments that are required for:

* Supervised ASR training (audio ↔ text supervision)
* TTS duration modeling
* Dataset inspection and forced alignment analysis
* Verse-level slicing (optional)
* Time-based quality evaluation

### **Method**

* Audio files (`.mp3`) and matching text files (`.txt`) were placed in a directory
* The tool processed each matched pair
* Outputs included:

  * **JSON** files with timestamp metadata
  * **SRT** subtitle files for visual inspection

### **Alignment Strategy**

Alignment was performed at verse granularity, using:

* MMS ASR multilingual models
* Automatic language recognition (no manual language override was required for Bambara)
* Default silence-handling settings






## **Audio Splitting After Segmentation**

After performing verse-level timestamping with the Waha TimestampAudio tool, the extracted audio segments may vary in length. Some segments can be too long for stable model training, while others may be very short. To standardize segment duration and improve usability for NLP/ASR/TTS tasks, we perform **post-segmentation splitting and extraction** based on the timestamp JSON.

### **Purpose of Splitting**

* Ensure segments are **within a target duration range** (e.g., 1–30 seconds)
* Avoid overlong audio that may hinder ASR or TTS training
* Respect natural pauses in speech to **preserve intelligibility**
* Generate segments suitable for **batch processing and model ingestion**



- **Input**: The JSON file generated by the timestamping tool, containing audio filenames, verse-level timings, and corresponding text.
- **Segment Extraction**: Using `pydub`, each verse’s start and end times are used to extract a `.wav` file segment.
  * Each segment is saved as a separate `.wav` file.
  * A new JSON file is generated, containing the following metadata for each segment:

```json
[
  {
    "audio_filepath": "segments/audio_168.1.wav",
    "text": "bambara text here",
    "duration": 3.72
  },
  {
    "audio_filepath": "segments/audio_168.2.wav",
    "text": "bambara text here",
    "duration": 22.38
  }
]
```

### **Result**

* Uniform, intelligible segments aligned with text
* Ready for **ASR/TTS model training**, evaluation, or further preprocessing
* Progress bars make **batch processing** over large datasets easy to monitor
* Optional integration with normalization, phonetic alignment, or forced-alignment pipelines




In [None]:
import json
from pathlib import Path
from pydub import AudioSegment
from tqdm import tqdm

def create_segments_from_json(json_path: str, audio_dir: str, output_dir: str) -> str:
    """
    Extract segments from audio based on Waha timestamp JSON and export them as WAV files
    with a progress bar.

    Each entry of the input JSON is read for its ``audio_file`` name and its
    ``sections`` list; each section is read for ``timings`` (start and end in
    seconds), ``text`` and ``verse_id``. Entries whose audio file is missing
    and sections with a non-positive duration are skipped with a warning.

    Args:
        json_path (str): Path to the segmentation JSON file
        audio_dir (str): Directory containing the original audio files
        output_dir (str): Directory to save extracted segments and output JSON

    Returns:
        output_json_path (str): Path to the JSON file containing all segment info
    """
    timestamps_path = Path(json_path)
    source_dir = Path(audio_dir)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(timestamps_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    segments_info = []

    for entry in tqdm(data, desc="Processing audio files", unit="file"):
        audio_file = source_dir / entry["audio_file"]
        if not audio_file.exists():
            print(f"Warning: Audio file {audio_file} not found. Skipping.")
            continue

        # Tolerate both a missing key and an explicit null.
        sections = entry.get("sections") or []
        if not sections:
            continue

        # Decode the source audio once per file rather than once per section.
        audio = AudioSegment.from_file(audio_file)

        for section in tqdm(sections, desc=f"Processing sections in {entry['audio_file']}", leave=False, unit="segment"):
            start_sec, end_sec = section["timings"]
            start_ms = int(start_sec * 1000)
            end_ms = int(end_sec * 1000)

            # An empty or inverted time range would produce an empty WAV file.
            if end_ms <= start_ms:
                print(f"Warning: Empty segment {section['verse_id']} in {entry['audio_file']}. Skipping.")
                continue

            # NOTE(review): files are named by verse_id alone; this assumes
            # verse_id values are unique across all audio files, otherwise a
            # later segment overwrites an earlier one - TODO confirm.
            segment_path = target_dir / f"{section['verse_id']}.wav"

            segment_audio = audio[start_ms:end_ms]
            segment_audio.export(segment_path, format="wav")

            segments_info.append({
                "audio_filepath": str(segment_path),
                "text": section["text"],
                # len() of a pydub segment is its length in milliseconds.
                "duration": len(segment_audio) / 1000.0
            })

    output_json_path = target_dir / f"{timestamps_path.stem}_segments.json"
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(segments_info, f, ensure_ascii=False, indent=2)

    print(f"Exported {len(segments_info)} segments to {output_json_path}")
    return str(output_json_path)


In [None]:
# Slice the source audio into per-verse WAV files and write the segment manifest.
output_json = create_segments_from_json(
    json_path="segments_timestamp.json",
    audio_dir="audio/",
    output_dir="segments/"
)

# The manifest is written as UTF-8 with non-ASCII characters kept as-is, so
# read it back with the same encoding instead of the platform default.
with open(output_json, "r", encoding="utf-8") as f:
    segments = json.load(f)

# Show the first two records as a quick sanity check.
print(segments[:2])
