In [None]:
import kagglehub

import shutil

# Download the latest version of the dataset file.
# NOTE: `path=` selects a single file *inside* the Kaggle dataset; it is not a
# destination. kagglehub stores the download in its own cache directory and
# returns the cached location.
path = kagglehub.dataset_download("phyred23/bibleverses", path="bible_data_set.csv")
print("Path to dataset files:", path)

# Copy the cached file into the current working directory so it does not have
# to be located and moved by hand. shutil.copy returns the new file's path.
local_path = shutil.copy(path, ".")
print("Copied dataset to:", local_path)

In [1]:
import requests
import csv
from tqdm import tqdm

# Chapter count for every book of the Bible, keyed by book name.
# Insertion order is canonical order; code that iterates this dict visits the
# books in that order.
books_of_bible = {
    # --- Old Testament ---
    "Genesis": 50, "Exodus": 40, "Leviticus": 27, "Numbers": 36, "Deuteronomy": 34,
    "Joshua": 24, "Judges": 21, "Ruth": 4,
    "1 Samuel": 31, "2 Samuel": 24, "1 Kings": 22, "2 Kings": 25,
    "1 Chronicles": 29, "2 Chronicles": 36,
    "Ezra": 10, "Nehemiah": 13, "Esther": 10,
    "Job": 42, "Psalms": 150, "Proverbs": 31, "Ecclesiastes": 12, "Song of Solomon": 8,
    "Isaiah": 66, "Jeremiah": 52, "Lamentations": 5, "Ezekiel": 48, "Daniel": 12,
    "Hosea": 14, "Joel": 3, "Amos": 9, "Obadiah": 1, "Jonah": 4, "Micah": 7,
    "Nahum": 3, "Habakkuk": 3, "Zephaniah": 3, "Haggai": 2, "Zechariah": 14, "Malachi": 4,
    # --- New Testament ---
    "Matthew": 28, "Mark": 16, "Luke": 24, "John": 21, "Acts": 28,
    "Romans": 16, "1 Corinthians": 16, "2 Corinthians": 13,
    "Galatians": 6, "Ephesians": 6, "Philippians": 4, "Colossians": 4,
    "1 Thessalonians": 5, "2 Thessalonians": 3,
    "1 Timothy": 6, "2 Timothy": 4, "Titus": 3, "Philemon": 1,
    "Hebrews": 13, "James": 5, "1 Peter": 5, "2 Peter": 3,
    "1 John": 5, "2 John": 1, "3 John": 1, "Jude": 1,
    "Revelation": 22,
}

# Translation identifier sent to the API as the `translation` query parameter.
# "web" selects the World English Bible.
# NOTE(review): an earlier note suggested switching to 'esv' or 'niv'; confirm
# that the API at `base_url` actually serves those identifiers before changing
# this value.
version = "web"

# Base URL of the Bible API; the book name and chapter number are appended to
# it for each request.
base_url = "https://bible-api.com/"

# Path of the CSV file the downloaded verses are written to (relative to the
# notebook's working directory).
csv_filename = "data/web.csv"

import os
import time

# requests applies no timeout by default, so a stalled connection would block
# the whole run forever without this.
REQUEST_TIMEOUT_SECONDS = 30
# Total tries per chapter (first attempt + retries) before it is reported as failed.
MAX_ATTEMPTS = 4


def _fetch_chapter_verses(book, chapter):
    """Download one chapter and return its list of verse dicts.

    Args:
        book: Book name as used in the request URL (e.g. "1 Samuel").
        chapter: 1-based chapter number.

    Returns:
        The ``"verses"`` list from the API's JSON response.

    Raises:
        RuntimeError: If the chapter could not be fetched after
            ``MAX_ATTEMPTS`` tries, or the server answered with a
            non-retryable status. The message names the chapter and the
            last error seen.
    """
    url = f"{base_url}{book}+{chapter}"
    query = {
        "translation": version,
        # Single-chapter books (Obadiah, Philemon, 2 John, 3 John, Jude) are
        # ambiguous: "Jude 1" is read by the API as verse 1 unless this flag
        # is set, which would silently drop every other verse of the book.
        "single_chapter_book_matching": "indifferent",
    }
    last_error = "no attempt made"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, params=query, timeout=REQUEST_TIMEOUT_SECONDS)
            status = response.status_code
            if status == 200:
                return response.json()["verses"]
            last_error = f"HTTP {status}"
            # Only rate limiting (429) and server errors (5xx) are worth retrying.
            if status != 429 and status < 500:
                break
        except (requests.RequestException, ValueError, KeyError) as exc:
            # Network failure, malformed JSON, or a payload without "verses".
            last_error = repr(exc)
        if attempt < MAX_ATTEMPTS:
            time.sleep(2 ** attempt)  # exponential backoff: 2s, 4s, 8s, ...
    raise RuntimeError(f"{book} {chapter}: {last_error}")


# Make sure the output directory exists; open() does not create it.
output_dir = os.path.dirname(csv_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Chapters that could not be downloaded, reported after the run instead of
# being dropped silently.
failed_chapters = []

# Open CSV for writing
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["book", "chapter", "verse", "modern_text"])

    # Initialize progress bar (one tick per chapter, successful or not)
    total_chapters = sum(books_of_bible.values())
    with tqdm(total=total_chapters, desc="Processing Bible Chapters", unit="chapter") as pbar:
        # Loop through books and chapters
        for book, num_chapters in books_of_bible.items():
            for chapter in range(1, num_chapters + 1):
                try:
                    verses = _fetch_chapter_verses(book, chapter)
                except RuntimeError as exc:
                    failed_chapters.append(str(exc))
                    verses = []
                for verse in verses:
                    # Clean up the text by removing extra spaces and newlines
                    cleaned_text = " ".join(verse["text"].split())
                    writer.writerow([book, verse["chapter"], verse["verse"], cleaned_text])
                pbar.update(1)

if failed_chapters:
    print(f"⚠️ {len(failed_chapters)} chapter(s) could not be downloaded and are missing from {csv_filename}:")
    for failure in failed_chapters:
        print(f"   - {failure}")
else:
    print(f"🎉 Done! Saved modern translation to {csv_filename}")

Processing Bible Chapters: 100%|██████████| 1189/1189 [03:33<00:00,  5.57chapter/s]

🎉 Done! Saved modern translation to data/web.csv





In [2]:
import pandas as pd

# Load KJV and modern translations
kjv_df = pd.read_csv("data/kjv.csv")
modern_df = pd.read_csv("data/web.csv")

# Where the aligned verse pairs are written.
merged_csv_path = "data/web_to_kjv.csv"

# Align the two translations verse-by-verse (inner join on book/chapter/verse),
# then keep only the two text columns used for training. The KJV frame's
# `text` column is renamed so the output columns are self-describing.
merged_df = (
    pd.merge(kjv_df, modern_df, on=["book", "chapter", "verse"], suffixes=("_kjv", "_modern"))
    [["modern_text", "text"]]
    .rename(columns={"text": "kjv_text"})
)

# An inner join silently drops verses whose key is missing or spelled
# differently in one file (e.g. a differing book name), so surface any
# shortfall instead of hiding it.
expected_rows = min(len(kjv_df), len(modern_df))
if len(merged_df) < expected_rows:
    print(
        f"⚠️ Only {len(merged_df)} of {expected_rows} verses matched "
        f"(KJV rows: {len(kjv_df)}, modern rows: {len(modern_df)}). "
        "Check that book/chapter/verse values agree between the two files."
    )

merged_df.to_csv(merged_csv_path, index=False)

# Report the path that was actually written.
print(f"✅ Merged dataset saved as {merged_csv_path}!")

✅ Merged dataset saved as modern_to_kjv.csv!
