## Data Analysis

In [2]:
import re

# OEB

In [3]:
import re

# File paths
input_file = 'OEB.txt'
output_tsv_file = 'OEB_combined.tsv'

# Book names extracted from the table of contents
book_names = [
    "Ruth", "Esther", "Psalms", "Hosea", "Joel", "Amos", "Obadiah", "Jonah", 
    "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", 
    "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians", 
    "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", 
    "Philemon", "Hebrews", "James", "1 Peter", "2 Peter", "1 John", "2 John", 
    "3 John", "Jude", "Revelation"
]

# Regex pattern to detect verses
verse_pattern = re.compile(r"\[(\d+:\d+)\]\s+(.+?)(?=\[\d+:\d+\]|$)")

# Data collection
combined_data = []
current_book = None

# Use the order of books from the table of contents to infer context
book_index = 0

with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Detect verses
        for verse_match in verse_pattern.finditer(line):
            verse_id, verse_text = verse_match.groups()
            chapter_num, verse_num = verse_id.split(":")

            # Assign current book based on the detected order in the text
            if current_book is None or len(combined_data) > 0 and int(chapter_num) == 1 and int(verse_num) == 1:
                current_book = book_names[book_index]
                book_index += 1

            # Collect data
            combined_data.append((current_book, chapter_num, verse_num, verse_text.strip()))

# Write the data to the TSV file
with open(output_tsv_file, 'w', encoding='utf-8') as out_file:
    out_file.write("Book\tChapter\tVerse\tText\n")
    for entry in combined_data:
        out_file.write("\t".join(entry) + "\n")

print(f"Extraction complete! {len(combined_data)} verses written to {output_tsv_file}")


Extraction complete! 11722 verses written to OEB_combined.tsv


# WEB

In [4]:
# Unified processing for the World English Bible (WEB)

input_web_file = 'WEB.txt'
output_web_tsv_file = 'WEB_combined.tsv'

# Regex patterns for books, chapters, and verses
book_pattern = re.compile(r"^Book \d+\s+(.+)$")  # Matches lines like "Book 01 Genesis"
verse_pattern = re.compile(r"^(\d{3}):(\d{3})\s+(.+)$")  # Matches lines like "001:001 Text"

# Data collection
web_data = []
current_book = None

with open(input_web_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Detect book titles
        book_match = book_pattern.match(line)
        if book_match:
            current_book = book_match.group(1).strip()
            continue

        # Detect verses
        verse_match = verse_pattern.match(line)
        if verse_match and current_book:
            chapter_num, verse_num, verse_text = verse_match.groups()
            web_data.append((current_book, chapter_num, verse_num, verse_text.strip()))

# Write data to a single TSV file
with open(output_web_tsv_file, 'w', encoding='utf-8') as out_file:
    out_file.write("Book\tChapter\tVerse\tText\n")
    for entry in web_data:
        out_file.write("\t".join(entry) + "\n")

len(web_data), output_web_tsv_file


(31102, 'WEB_combined.tsv')

## KJV

In [7]:
import re

# Input and output files
input_kjv_file = 'KJV.txt'
output_ot_file = 'KJV_OT.tsv'
output_nt_file = 'KJV_NT.tsv'

# Markers for Old and New Testaments
ot_marker = "The Old Testament of the King James Version of the Bible"
nt_marker = "The New Testament of the King James Bible"

# Patterns
book_pattern = re.compile(r"^The (?:Book of )?([\w\s]+)$")  # Matches book names like "Genesis"
verse_pattern = re.compile(r"(\d+):(\d+)\s+(.+)")  # Matches verses like "1:1 Text"

# Data containers
ot_data = []
nt_data = []
current_testament = None
current_book = None
current_text = []
current_chapter, current_verse = None, None

with open(input_kjv_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Detect Testament markers
        if ot_marker in line:
            current_testament = 'OT'
            continue
        elif nt_marker in line:
            current_testament = 'NT'
            continue

        # Detect book names
        book_match = book_pattern.match(line)
        if book_match:
            current_book = book_match.group(1).strip()
            continue

        # Detect verses
        verse_match = verse_pattern.match(line)
        if verse_match:
            # Save the previous verse
            if current_book and current_chapter and current_verse:
                full_text = " ".join(current_text).strip()
                if current_testament == 'OT':
                    ot_data.append((current_book, current_chapter, current_verse, full_text))
                elif current_testament == 'NT':
                    nt_data.append((current_book, current_chapter, current_verse, full_text))

            # Start a new verse
            current_chapter, current_verse, text = verse_match.groups()
            current_text = [text]
        else:
            # Accumulate lines for the current verse
            current_text.append(line)

    # Save the last verse
    if current_book and current_chapter and current_verse:
        full_text = " ".join(current_text).strip()
        if current_testament == 'OT':
            ot_data.append((current_book, current_chapter, current_verse, full_text))
        elif current_testament == 'NT':
            nt_data.append((current_book, current_chapter, current_verse, full_text))

# Write data to TSV files
with open(output_ot_file, 'w', encoding='utf-8') as ot_file:
    ot_file.write("Book\tChapter\tVerse\tText\n")
    for entry in ot_data:
        ot_file.write("\t".join(entry) + "\n")

with open(output_nt_file, 'w', encoding='utf-8') as nt_file:
    nt_file.write("Book\tChapter\tVerse\tText\n")
    for entry in nt_data:
        nt_file.write("\t".join(entry) + "\n")

print(f"Old Testament verses: {len(ot_data)}")
print(f"New Testament verses: {len(nt_data)}")

Old Testament verses: 18963
New Testament verses: 6032


## DRB

In [None]:
DRB_old_test_books = """The Book of Genesis
 The Book of Exodus
 The Book of Leviticus
 The Book of Numbers
 The Book of Deuteronomy
 The Book of Josue
 The Book of Judges
 The Book of Ruth
 The First Book of Samuel, otherwise called the First Book of Kings
 The Second Book of Samuel, otherwise called the Second Book of Kings
 The Third Book of Kings
 The Fourth Book of Kings
 The First Book of Paralipomenon
 The Second Book of Paralipomenon
 The First Book of Esdras
 The Book of Nehemias, which is called the Second of Esdras
 The Book of Tobias
 The Book of Judith
 The Book of Esther
 The Book of Job
 The Book of Psalms
 The Book of Proverbs
 Ecclesiastes
 Solomon’s Canticle of Canticles
 The Book of Wisdom
 Ecclesiasticus
 The Prophecy of Isaias
 The Prophecy of Jeremias
 The Lamentations of Jeremias
 The Prophecy of Baruch
 The Prophecy of Ezechiel
 The Prophecy of Daniel
 The Prophecy of Osee
 The Prophecy of Joel
 The Prophecy of Amos
 The Prophecy of Abdias
 The Prophecy of Jonas
 The Prophecy of Micheas
 The Prophecy of Nahum
 The Prophecy of Habacuc
 The Prophecy of Sophonias
 The Prophecy of Aggeus
 The Prophecy of Zacharias
 The Prophecy of Malachias
 The First Book of Machabees
 The Second Book of Machabees""".split('\n ')

DRB_new_test_books = """The Holy Gospel of Jesus Christ According to St. Matthew
 The Holy Gospel of Jesus Christ According to St. Mark
 The Holy Gospel of Jesus Christ According to St. Luke
 The Holy Gospel of Jesus Christ  According to St. John
 The Acts of the Apostles
 The Epistle of St. Paul the Apostle to the Romans
 The First Epistle of St. Paul to the Corinthians
 The Second Epistle of St. Paul to the Corinthians
 The Epistle of St. Paul to the Galatians
 The Epistle of St. Paul to the Ephesians
 The Epistle of St. Paul to the Philippians
 The Epistle of St. Paul to the Colossians
 The First Epistle of St. Paul to the Thessalonians
 The Second Epistle of St. Paul to the Thessalonians
 The First Epistle of St. Paul to Timothy
 The Second Epistle of St. Paul to Timothy
 The Epistle of St. Paul to Titus
 The Epistle of St. Paul to Philemon
 The Epistle of St. Paul to the Hebrews
 The Catholic Epistle of St. James the Apostle
 The First Epistle of St. Peter the Apostle
 The Second Epistle of St. Peter the Apostle
 The First Epistle of St. John the Apostle
 The Second Epistle of St. John the Apostle
 The Third Epistle of St. John the Apostle
 The Catholic Epistle of St. Jude the Apostle
 The Apocalypse of St. John the Apostle""".split('\n ')
print(DRB_old_test_books)
print(DRB_new_test_books)

Processed 3 verses into DRB_combined.tsv.


In [None]:
txt = "The Book of Leviticus"
print(txt.upper())

In [None]:
import re

# List of book titles (assuming they are already in uppercase)
titles = DRB_old_test_books + DRB_new_test_books
titles = [element.upper() for element in titles]

# Open the input Bible text file
with open('DRV.txt', "r") as f:
    line_counter = 0
    book_name = None
    book_content = ""  # Temporary storage for the current book's content

    # Iterate through each line in the file
    for line in f:
        line_counter += 1
        
        # Only process lines between 145 and 140345
        if line_counter < 145:
            continue  # Skip lines before 145
        if line_counter > 140345:
            break  # Stop processing after line 140345

        line = line.strip()  # Remove leading and trailing whitespace
        
        # Check if the line contains a book title
        for title in titles:
            if title in line:  # If a book title is found
                if book_name:  # Process the previous book if it exists
                    # Save the content of the previous book into a text file
                    with open(f'DRB_{book_name}.txt', 'w') as book_file:
                        book_file.write(book_content)
                
                # Set the new book name and reset content for the new book
                book_name = title
                book_content = ""  # Reset content for the next book
                break  # Stop checking for other titles once the current one is found

        # Append the current line to the book's content (if book_name is set)
        if book_name:
            book_content += line + "\n"  # Add newline between lines of the book

    # Save the last book's content after processing all lines
    if book_name and book_content:
        with open(f'DRB_{book_name}.txt', 'w') as book_file:
            book_file.write(book_content)

print("Books have been saved to separate text files.")
