<a href="https://colab.research.google.com/github/r-karra/WoG/blob/main/01_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import json
import requests

# 1. Setup URLs and Filenames
SOURCE_URL = "https://raw.githubusercontent.com/r-karra/WoG/main/data/raw/nkjv_bible.txt"

OUTPUT_FILENAME = "kjv_bible_dataset.json"

def convert_bible_to_json():
    print(f"Downloading raw Bible text from {SOURCE_URL}...")
    try:
        response = requests.get(SOURCE_URL)
        response.encoding = 'utf-8-sig' # Handles BOM if present
        raw_text = response.text
    except Exception as e:
        print(f"Failed to download: {e}")
        return

    print("Parsing text... (This may take a few seconds)")

    # 2. Extract the core Bible text
    # We look for the start of Genesis and the end of the ebook
    start_marker = "The First Book of Moses: Called Genesis"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    start_idx = raw_text.find(start_marker)
    end_idx = raw_text.find(end_marker)

    if start_idx == -1:
        print("Error: Could not find start marker. Gutenberg format might have changed.")
        return

    clean_text = raw_text[start_idx:end_idx]
    lines = clean_text.split('\n')

    # 3. Regex Patterns
    # Matches "1:1 In the beginning..."
    verse_pattern = re.compile(r"^(\d+):(\d+)\s+(.+)$")

    bible_data = []
    current_book_obj = None
    current_book_name = ""

    # Standard KJV Books to help identify headers
    known_books = [
        "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth",
        "1 Samuel", "2 Samuel", "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra",
        "Nehemiah", "Esther", "Job", "Psalms", "Proverbs", "Ecclesiastes", "Song of Solomon",
        "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",
        "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah",
        "Malachi", "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians",
        "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians", "1 Thessalonians",
        "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon", "Hebrews", "James",
        "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation"
    ]

    for line in lines:
        line = line.strip()
        if not line: continue

        # A. Check for Verse (Logic: If it starts with Number:Number, it's a verse)
        verse_match = verse_pattern.match(line)
        if verse_match:
            chapter_idx = int(verse_match.group(1)) - 1
            verse_text = verse_match.group(3).strip()

            if current_book_obj:
                # Expand chapters list if needed
                while len(current_book_obj["chapters"]) <= chapter_idx:
                    current_book_obj["chapters"].append([])
                current_book_obj["chapters"][chapter_idx].append(verse_text)
            continue

        # B. Check for Book Title
        # Logic: If line matches a known book name
        for kb in known_books:
            if kb in line and len(line) < 100 and "Testament" not in line:
                # If we found a new book
                if current_book_name != kb:
                    # Save old book
                    if current_book_obj:
                        bible_data.append(current_book_obj)

                    # Start new book
                    current_book_name = kb
                    current_book_obj = {
                        "book": kb,
                        "chapters": []
                    }
                break

    # Save the last book
    if current_book_obj:
        bible_data.append(current_book_obj)

    # 4. Write JSON file
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        json.dump(bible_data, f, indent=4)

    print(f"SUCCESS! Converted {len(bible_data)} books.")
    print(f"Saved as: {OUTPUT_FILENAME}")

# Run the function
convert_bible_to_json()

Downloading raw Bible text from https://raw.githubusercontent.com/r-karra/WoG/main/data/raw/nkjv_bible.txt...
Parsing text... (This may take a few seconds)
SUCCESS! Converted 242 books.
Saved as: kjv_bible_dataset.json


In [3]:
from google.colab import files
files.download('kjv_bible_dataset.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>