In [21]:
import os
import requests
import pandas as pd
from typing import List, Dict
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def download_file(url: str, local_path: str, timeout: float = 30.0) -> None:
    """Download ``url`` to ``local_path`` unless that file already exists.

    The response body is streamed into a sibling ``<local_path>.part`` file and
    only renamed to ``local_path`` once the transfer has finished. An
    interrupted transfer therefore never leaves a truncated file behind that a
    later run would mistake for a completed download.

    Args:
        url: Address of the resource to fetch.
        local_path: Destination path on disk; its directory must already exist.
        timeout: Seconds to wait for the server to connect / send data before
            ``requests`` gives up. Prevents the call from hanging forever.

    Returns:
        None. The outcome is reported on stdout. A non-200 response prints a
        failure message; network-level errors raised by ``requests`` propagate
        to the caller.
    """
    if os.path.exists(local_path):
        print(f"File already exists: {local_path}")
        return

    partial_path = f"{local_path}.part"
    try:
        # stream=True avoids holding the whole payload in memory at once.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {local_path}")
                return
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Publish the file only after every byte has been written.
        os.replace(partial_path, local_path)
    finally:
        # Still present only if the transfer was interrupted part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded: {local_path}")
def download_ebible_file(language_code: str, file_suffix: str = "") -> str:
    """Ensure an eBible corpus file is available locally.

    The file ``<language_code><file_suffix>.txt`` is fetched from the BibleNLP
    eBible corpus on GitHub into the ``bible_sources`` directory (created if
    needed) unless it is already present.

    Args:
        language_code: Corpus file identifier, e.g. ``"fra-fraLSG"``.
        file_suffix: Optional text appended to the identifier before ``.txt``.

    Returns:
        The local path of the file, or an empty string when the file is still
        missing after the download attempt, so callers can test the result for
        truthiness.
    """
    base_url = "https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/"
    filename = f"{language_code}{file_suffix}.txt"
    local_path = os.path.join("bible_sources", filename)
    os.makedirs("bible_sources", exist_ok=True)

    if os.path.exists(local_path):
        print(f"File already exists: {local_path}")
        return local_path

    download_file(base_url + filename, local_path)

    # download_file only prints on a non-200 response, so check the disk to
    # avoid handing back a path that does not exist.
    return local_path if os.path.exists(local_path) else ""

def download_vref_file() -> str:
    """Ensure the eBible ``vref.txt`` metadata file is available locally.

    The file is stored as ``bible_sources/vref.txt``; the directory is created
    when missing so this function can be called on its own.

    Returns:
        The local path of ``vref.txt``.
    """
    vref_url = "https://raw.githubusercontent.com/BibleNLP/ebible/main/metadata/vref.txt"
    vref_path = os.path.join("bible_sources", "vref.txt")

    # Without this, writing the download fails when no other helper has
    # created the directory yet.
    os.makedirs("bible_sources", exist_ok=True)

    if os.path.exists(vref_path):
        print(f"File already exists: {vref_path}")
    else:
        download_file(vref_url, vref_path)
    return vref_path

def download_macula_files():
    """Fetch the Macula Hebrew and Greek TSV files into ``bible_sources``.

    The target directory is created when absent. Each file is handed to
    ``download_file``, which skips files that are already on disk.
    """
    target_dir = "bible_sources"
    os.makedirs(target_dir, exist_ok=True)

    # (remote URL, local file name) — Hebrew first, then Greek.
    sources = (
        (
            "https://github.com/Clear-Bible/macula-hebrew/raw/main/WLC/tsv/macula-hebrew.tsv",
            "macula-hebrew.tsv",
        ),
        (
            "https://github.com/Clear-Bible/macula-greek/raw/main/Nestle1904/tsv/macula-greek-Nestle1904.tsv",
            "macula-greek.tsv",
        ),
    )

    for remote_url, local_name in sources:
        download_file(remote_url, os.path.join(target_dir, local_name))

def load_bible_content(file_path: str, vref_path: str) -> Dict[str, str]:
    """Map each verse reference to the verse text on the same line number.

    Line *n* of ``vref_path`` is the key for line *n* of ``file_path``; both
    are stripped of surrounding whitespace. Pairing stops at the end of the
    shorter file.
    """
    with open(vref_path, 'r', encoding='utf-8') as reference_lines:
        with open(file_path, 'r', encoding='utf-8') as verse_lines:
            return {
                reference.strip(): verse.strip()
                for reference, verse in zip(reference_lines, verse_lines)
            }

def load_vref_content(vref_path: str) -> List[str]:
    """Return the lines of the vref file, each stripped of surrounding whitespace."""
    references: List[str] = []
    with open(vref_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            references.append(raw_line.strip())
    return references

def load_macula_content() -> Dict[str, str]:
    """Load the Macula Hebrew and Greek TSVs into one verse-level mapping.

    Each TSV row carries a ``ref`` and a ``text`` column. The part of ``ref``
    before the first ``!`` is used as the verse reference, and the ``text``
    values that share a reference are joined with single spaces in file order
    (Hebrew file first, then Greek).

    Returns:
        Dict mapping verse reference to the space-joined text of its rows,
        with no leading or trailing separator.
    """
    hebrew_path = os.path.join("bible_sources", "macula-hebrew.tsv")
    greek_path = os.path.join("bible_sources", "macula-greek.tsv")

    words_by_vref: Dict[str, List[str]] = {}

    for path in (hebrew_path, greek_path):
        # Read both columns as verbatim strings: the default NA handling would
        # turn tokens such as "NA" or empty cells into NaN, i.e. the text "nan".
        df = pd.read_csv(
            path,
            sep="\t",
            usecols=['ref', 'text'],
            dtype=str,
            keep_default_na=False,
        )
        verse_refs = df['ref'].str.split('!', n=1).str[0]
        # Plain zip over the two columns is far cheaper than DataFrame.iterrows.
        for vref, text in zip(verse_refs, df['text']):
            words_by_vref.setdefault(vref, []).append(text)

    # Join once per verse; avoids the leading space that incremental
    # "previous + ' ' + word" concatenation puts in front of the first word.
    return {vref: ' '.join(words) for vref, words in words_by_vref.items()}

def align_verses(bibles: List[Dict[str, str]], macula_content: Dict[str, str], vrefs: List[str]) -> Dict[str, Dict[str, str]]:
    """Build one record per verse reference combining Macula and all translations.

    Every record has a ``"source"`` entry taken from ``macula_content`` and one
    ``"lang_<i>"`` entry per item of ``bibles`` (``i`` is the item's position in
    the list). Missing texts become empty strings; references whose record is
    empty in every entry are omitted from the result.
    """
    result: Dict[str, Dict[str, str]] = {}

    for reference in tqdm(vrefs, desc="Aligning verses"):
        record = {"source": macula_content.get(reference, "")}
        record.update(
            (f"lang_{position}", translation.get(reference, ""))
            for position, translation in enumerate(bibles)
        )

        # Skip references that carry no text at all.
        if not any(record.values()):
            continue
        result[reference] = record

    return result

def process_ebible_files(file_list: List[str]) -> None:
    """Download, align and export a set of eBible translations.

    Steps: fetch the Macula files and ``vref.txt``, download the requested
    eBible files in parallel, align everything per verse reference, and write
    the result to ``aligned_verses_<ids joined by "_">.json`` in the current
    working directory.

    In the output, ``lang_<i>`` always corresponds to ``file_list[i]``. A file
    that could not be obtained keeps its position and contributes empty text,
    so the remaining indices do not shift.

    Args:
        file_list: eBible corpus identifiers (file names without ``.txt``).
    """
    if not file_list:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to align.
        print("No Bible files were successfully loaded.")
        return

    # Download Macula files
    download_macula_files()

    # Download vref file
    vref_path = download_vref_file()

    # Load vref content
    vrefs = load_vref_content(vref_path)

    # Load Macula content
    macula_content = load_macula_content()

    # Download in parallel. executor.map yields results in submission order
    # (unlike as_completed), which keeps lang_<i> tied to file_list[i].
    # The worker count is capped so long lists do not spawn a thread per file.
    worker_count = min(len(file_list), 8)
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        file_paths = list(executor.map(download_ebible_file, file_list))

    bibles = []
    loaded_count = 0
    for file_info, file_path in zip(file_list, file_paths):
        if file_path and os.path.exists(file_path):
            bibles.append(load_bible_content(file_path, vref_path))
            loaded_count += 1
        else:
            print(f"Skipping {file_info}: file is not available locally.")
            # Placeholder keeps later translations at their original index.
            bibles.append({})

    if not loaded_count:
        print("No Bible files were successfully loaded.")
        return

    aligned_verses = align_verses(bibles, macula_content, vrefs)

    # Save aligned verses to JSON
    all_files_label = "_".join(file_list)
    output_file = f"aligned_verses_{all_files_label}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(aligned_verses, f, ensure_ascii=False, indent=2)

    print(f"Aligned verses saved to {output_file}")

# Example usage: align four eBible translations, named by their corpus file
# identifiers (file name without ".txt"), against the Macula source text.
# Running this downloads any missing input files into bible_sources/ and
# writes the aligned JSON file to the current working directory.
ebible_files = ["fra-fraLSG", "fra-fra_fob", "fra-francl", "fra-frasbl"]
process_ebible_files(ebible_files)

In [22]:
# Corpus files that the identifiers below resolve to under bible_sources/:
#   fra-fraLSG.txt
#   fra-fra_fob.txt
#   fra-francl.txt
#   fra-frasbl.txt

# Example usage (same call as the previous cell). Input files that are already
# on disk are reused instead of being downloaded again.
ebible_files = ["fra-fraLSG", "fra-fra_fob", "fra-francl", "fra-frasbl"]
process_ebible_files(ebible_files)

File already exists: bible_sources/macula-hebrew.tsv
File already exists: bible_sources/macula-greek.tsv
File already exists: bible_sources/vref.txt
File already exists: bible_sources/fra-fraLSG.txt
File already exists: bible_sources/fra-francl.txt
File already exists: bible_sources/fra-fra_fob.txt
File already exists: bible_sources/fra-frasbl.txt


Aligning verses: 100%|██████████| 41899/41899 [00:00<00:00, 601271.21it/s]


Aligned verses saved to aligned_verses_fra-fraLSG_fra-fra_fob_fra-francl_fra-frasbl.json
