## Data Analysis

In [None]:
import re

# OEB

In [None]:
import re

# File paths
input_file = 'OEB.txt'
output_tsv_file = 'OEB_combined.tsv'

# Book names extracted from the table of contents
book_names = [
    "Ruth", "Esther", "Psalms", "Hosea", "Joel", "Amos", "Obadiah", "Jonah", 
    "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", 
    "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians", 
    "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", 
    "Philemon", "Hebrews", "James", "1 Peter", "2 Peter", "1 John", "2 John", 
    "3 John", "Jude", "Revelation"
]

# Regex pattern to detect verses
verse_pattern = re.compile(r"\[(\d+:\d+)\]\s+(.+?)(?=\[\d+:\d+\]|$)")

# Data collection
combined_data = []
current_book = None

# Use the order of books from the table of contents to infer context
book_index = 0

with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Detect verses
        for verse_match in verse_pattern.finditer(line):
            verse_id, verse_text = verse_match.groups()
            chapter_num, verse_num = verse_id.split(":")

            # Assign current book based on the detected order in the text
            if current_book is None or len(combined_data) > 0 and int(chapter_num) == 1 and int(verse_num) == 1:
                current_book = book_names[book_index]
                book_index += 1

            # Collect data
            combined_data.append((current_book, chapter_num, verse_num, verse_text.strip()))

# Write the data to the TSV file
with open(output_tsv_file, 'w', encoding='utf-8') as out_file:
    out_file.write("Book\tChapter\tVerse\tText\n")
    for entry in combined_data:
        out_file.write("\t".join(entry) + "\n")

print(f"Extraction complete! {len(combined_data)} verses written to {output_tsv_file}")


# WEB

In [None]:
import re

# File paths for input and output
input_web_file = "WEB.txt"
output_web_tsv_file = "WEB_combined.tsv"

# Regex patterns for books, chapters, and verses
book_pattern = re.compile(r"^Book \d+\s+(.+)$")  # Matches lines like "Book 01 Genesis"
verse_pattern = re.compile(r"^(\d{3}):(\d{3})\s+(.+)$")  # Matches lines like "001:001 Text"

# Function to normalize chapter and verse numbers
def normalize_number(value):
    return str(int(value))  # Remove leading zeros by converting to integer and back to string

# Function to clean text
def clean_text(text):
    return re.sub(r'[\"“”]', '', text).strip()  # Remove quotation marks and clean whitespace

# Data collection
web_data = []
current_book = None
current_verse = None
current_text = []

with open(input_web_file, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()

        # Detect book titles
        book_match = book_pattern.match(line)
        if book_match:
            current_book = book_match.group(1).strip()
            continue

        # Detect verses
        verse_match = verse_pattern.match(line)
        if verse_match and current_book:
            # Save the previous verse if it exists
            if current_verse and current_text:
                full_text = " ".join(current_text).strip()
                full_text = clean_text(full_text)  # Clean the text
                web_data.append((current_book, *current_verse, full_text))

            # Start a new verse
            chapter_num, verse_num, text = verse_match.groups()
            chapter_num = normalize_number(chapter_num)  # Normalize chapter number
            verse_num = normalize_number(verse_num)      # Normalize verse number
            current_verse = (chapter_num, verse_num)
            current_text = [text]
        else:
            # Accumulate lines for the current verse
            current_text.append(line)

    # Save the last verse
    if current_verse and current_text:
        full_text = " ".join(current_text).strip()
        full_text = clean_text(full_text)  # Clean the text
        web_data.append((current_book, *current_verse, full_text))

# Write data to a single TSV file
with open(output_web_tsv_file, "w", encoding="utf-8") as out_file:
    out_file.write("Book\tChapter\tVerse\tText\n")
    for entry in web_data:
        out_file.write("\t".join(entry) + "\n")

print(f"Processed {len(web_data)} entries. Cleaned WEB file saved as {output_web_tsv_file}.")





## KJV

In [None]:
import re

# File paths
input_kjv_file = 'KJV.txt'
reformatted_kjv_file = 'KJV_reformatted.txt'

# Regex to match Chapter:Verse markers
verse_marker = re.compile(r"(\d+:\d+)")

# Step 1: Reformat the file to ensure each verse starts on a new line
with open(input_kjv_file, 'r', encoding='utf-8') as infile, open(reformatted_kjv_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Replace inline Chapter:Verse markers with new-line-prefixed markers
        reformatted_line = verse_marker.sub(r"\n\1", line.strip())
        outfile.write(reformatted_line + "\n")

print(f"Reformatted file saved as: {reformatted_kjv_file}")



In [None]:
import re

# File paths for input and output
input_kjv_file = 'KJV_reformatted.txt'
output_ot_file = 'KJV_OT.tsv'
output_nt_file = 'KJV_NT.tsv'

# Markers for Old and New Testaments
ot_marker = "The Old Testament of the King James Version of the Bible"
nt_marker = "The New Testament of the King James Bible"

# Regex to match verses
verse_pattern = re.compile(r"(\d+):(\d+)\s+(.+?)$", re.DOTALL)

# Data containers
ot_data = []
nt_data = []
current_testament = None
current_book = None
current_verse = None
current_text = []

# Step 1: Read the file line by line
with open(input_kjv_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Detect Testament markers
        if ot_marker in line:
            current_testament = 'OT'
            continue
        elif nt_marker in line:
            current_testament = 'NT'
            continue

        # Detect book names
        if line.startswith("The ") and ("Book" in line or "Epistle" in line):
            current_book = line.strip()
            continue

        # Check for a verse match
        verse_match = verse_pattern.match(line)
        if verse_match:
            # Save the current verse if any
            if current_verse and current_text:
                full_text = " ".join(current_text).strip()
                if current_testament == 'OT':
                    ot_data.append((current_book, current_verse[0], current_verse[1], full_text))
                elif current_testament == 'NT':
                    nt_data.append((current_book, current_verse[0], current_verse[1], full_text))

            # Start a new verse
            chapter, verse, text = verse_match.groups()
            current_verse = (chapter, verse)
            current_text = [text]
        else:
            # Accumulate text for the current verse
            current_text.append(line)

    # Save the last verse
    if current_verse and current_text:
        full_text = " ".join(current_text).strip()
        if current_testament == 'OT':
            ot_data.append((current_book, current_verse[0], current_verse[1], full_text))
        elif current_testament == 'NT':
            nt_data.append((current_book, current_verse[0], current_verse[1], full_text))

# Step 2: Write the output
with open(output_ot_file, 'w', encoding='utf-8') as ot_file:
    ot_file.write("Book\tChapter\tVerse\tText\n")
    for entry in ot_data:
        ot_file.write("\t".join(entry) + "\n")

with open(output_nt_file, 'w', encoding='utf-8') as nt_file:
    nt_file.write("Book\tChapter\tVerse\tText\n")
    for entry in nt_data:
        nt_file.write("\t".join(entry) + "\n")

print(f"Old Testament verses: {len(ot_data)}")
print(f"New Testament verses: {len(nt_data)}")




## DRB

In [None]:
DRB_old_test_books = """The Book of Genesis
 The Book of Exodus
 The Book of Leviticus
 The Book of Numbers
 The Book of Deuteronomy
 The Book of Josue
 The Book of Judges
 The Book of Ruth
 The First Book of Samuel, otherwise called the First Book of Kings
 The Second Book of Samuel, otherwise called the Second Book of Kings
 The Third Book of Kings
 The Fourth Book of Kings
 The First Book of Paralipomenon
 The Second Book of Paralipomenon
 The First Book of Esdras
 The Book of Nehemias, which is called the Second of Esdras
 The Book of Tobias
 The Book of Judith
 The Book of Esther
 The Book of Job
 The Book of Psalms
 The Book of Proverbs
 Ecclesiastes
 Solomon’s Canticle of Canticles
 The Book of Wisdom
 Ecclesiasticus
 The Prophecy of Isaias
 The Prophecy of Jeremias
 The Lamentations of Jeremias
 The Prophecy of Baruch
 The Prophecy of Ezechiel
 The Prophecy of Daniel
 The Prophecy of Osee
 The Prophecy of Joel
 The Prophecy of Amos
 The Prophecy of Abdias
 The Prophecy of Jonas
 The Prophecy of Micheas
 The Prophecy of Nahum
 The Prophecy of Habacuc
 The Prophecy of Sophonias
 The Prophecy of Aggeus
 The Prophecy of Zacharias
 The Prophecy of Malachias
 The First Book of Machabees
 The Second Book of Machabees""".split('\n ')

DRB_new_test_books = """The Holy Gospel of Jesus Christ According to St. Matthew
 The Holy Gospel of Jesus Christ According to St. Mark
 The Holy Gospel of Jesus Christ According to St. Luke
 The Holy Gospel of Jesus Christ  According to St. John
 The Acts of the Apostles
 The Epistle of St. Paul the Apostle to the Romans
 The First Epistle of St. Paul to the Corinthians
 The Second Epistle of St. Paul to the Corinthians
 The Epistle of St. Paul to the Galatians
 The Epistle of St. Paul to the Ephesians
 The Epistle of St. Paul to the Philippians
 The Epistle of St. Paul to the Colossians
 The First Epistle of St. Paul to the Thessalonians
 The Second Epistle of St. Paul to the Thessalonians
 The First Epistle of St. Paul to Timothy
 The Second Epistle of St. Paul to Timothy
 The Epistle of St. Paul to Titus
 The Epistle of St. Paul to Philemon
 The Epistle of St. Paul to the Hebrews
 The Catholic Epistle of St. James the Apostle
 The First Epistle of St. Peter the Apostle
 The Second Epistle of St. Peter the Apostle
 The First Epistle of St. John the Apostle
 The Second Epistle of St. John the Apostle
 The Third Epistle of St. John the Apostle
 The Catholic Epistle of St. Jude the Apostle
 The Apocalypse of St. John the Apostle""".split('\n ')
print(DRB_old_test_books)
print(DRB_new_test_books)

In [None]:
txt = "The Book of Leviticus"
print(txt.upper())

In [None]:
import re

# List of book titles (assuming they are already in uppercase)
titles = DRB_old_test_books + DRB_new_test_books
titles = [element.upper() for element in titles]

# Open the input Bible text file
with open('DRV.txt', "r") as f:
    line_counter = 0
    book_name = None
    book_content = ""  # Temporary storage for the current book's content

    # Iterate through each line in the file
    for line in f:
        line_counter += 1
        
        # Only process lines between 145 and 140345
        if line_counter < 145:
            continue  # Skip lines before 145
        if line_counter > 140345:
            break  # Stop processing after line 140345

        line = line.strip()  # Remove leading and trailing whitespace
        
        # Check if the line contains a book title
        for title in titles:
            if title in line:  # If a book title is found
                if book_name:  # Process the previous book if it exists
                    # Save the content of the previous book into a text file
                    with open(f'DRB_{book_name}.txt', 'w') as book_file:
                        book_file.write(book_content)
                
                # Set the new book name and reset content for the new book
                book_name = title
                book_content = ""  # Reset content for the next book
                break  # Stop checking for other titles once the current one is found

        # Append the current line to the book's content (if book_name is set)
        if book_name:
            book_content += line + "\n"  # Add newline between lines of the book

    # Save the last book's content after processing all lines
    if book_name and book_content:
        with open(f'DRB_{book_name}.txt', 'w') as book_file:
            book_file.write(book_content)

print("Books have been saved to separate text files.")


# Save unique book titles

In [None]:
import pandas as pd

# File paths for the input TSV files
file_paths = {
    "DRB": "DRB_preprocessed_columns.tsv",
    "KJV_NT": "KJV_NT.tsv",
    "KJV_OT": "KJV_OT.tsv",
    "OEB": "OEB_combined.tsv",
    "WEB": "WEB_combined.tsv"
}

# Dictionary to store book titles for each version
titles_dict = {}

# Extract unique book titles from each file and store in the dictionary
for version, path in file_paths.items():
    try:
        # Load the TSV file
        df = pd.read_csv(path, sep="\t")
        
        # Extract unique book titles and normalize
        books = sorted(df["Book"].str.strip().str.lower().unique())
        
        # Add to dictionary
        titles_dict[version] = books
    except Exception as e:
        print(f"Error processing {version}: {e}")
        titles_dict[version] = []

# Create a DataFrame with unique book titles as rows and versions as columns
unique_titles_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in titles_dict.items()]))

# Save the table to a CSV file
output_path = "unique_book_titles_normalized.csv"
unique_titles_df.to_csv(output_path, index=False)

print(f"Unique book titles have been normalized and saved to {output_path}.")


## Combine the two KJV

In [None]:
import pandas as pd

# File paths for KJV Old and New Testament
kjv_ot_path = "KJV_OT.tsv"
kjv_nt_path = "KJV_NT.tsv"
combined_kjv_path = "KJV_combined.tsv"

# Load the Old and New Testament data
kjv_ot = pd.read_csv(kjv_ot_path, sep="\t")
kjv_nt = pd.read_csv(kjv_nt_path, sep="\t")

# Combine the datasets
kjv_combined = pd.concat([kjv_ot, kjv_nt]).drop_duplicates()

# Save the combined dataset
kjv_combined.to_csv(combined_kjv_path, sep="\t", index=False)

print(f"Combined KJV saved to {combined_kjv_path}.")

In [28]:
import pandas as pd

# File paths for the input TSV files
file_paths = {
    "DRB": "DRB_normalized_titles.tsv",
    "KJV_combined": "KJV_combined.tsv",
    "OEB": "OEB_combined.tsv",
    "WEB": "WEB_combined.tsv"
}

# Function to extract only words (no numbers) from a title
def extract_words(title):
    tokens = title.lower().split()  # Split into tokens
    return set(token for token in tokens if not token.isdigit())  # Exclude numbers

# Load and normalize titles from all files
titles_dict = {}
for version, path in file_paths.items():
    try:
        df = pd.read_csv(path, sep="\t")
        # Extract unique titles and normalize
        titles_dict[version] = sorted(df["Book"].str.strip().str.lower().unique())
    except Exception as e:
        print(f"Error processing {version}: {e}")
        titles_dict[version] = []

# Debug: Print all titles for inspection
for version, titles in titles_dict.items():
    print(f"\n{version} Titles ({len(titles)}):")
    print(titles)

# Tokenize the OEB titles
oeb_titles = titles_dict["OEB"]
oeb_words = {title: extract_words(title) for title in oeb_titles}

# Debug: Print tokenized OEB titles
print("\nTokenized OEB Titles:")
print(oeb_words)

# Compare OEB titles with all other versions for exact word matches
results = []

for oeb_title, oeb_words_set in oeb_words.items():
    print(f"\nOEB Title: {oeb_title}")
    print(f"Tokens: {oeb_words_set}")
    matched_versions = []
    matched_titles = []
    for version, titles in titles_dict.items():
        if version == "OEB":
            continue
        for title in titles:
            title_words = extract_words(title)
            if oeb_words_set & title_words:  # Intersection of words
                matched_versions.append(version)
                matched_titles.append((version, title))
                print(f"Matched {version}: {title} (Tokens: {title_words})")
    # Filter titles matching all 4 versions
    if len(set(matched_versions)) >= 3:
        print(f"OEB Title '{oeb_title}' matches all 4 versions.")
        for version, matched_title in matched_titles:
            results.append({
                "OEB Title": oeb_title,
                "Matched Version": version,
                "Matched Title": matched_title,
                "Matched Words": ", ".join(oeb_words_set & extract_words(matched_title))  # Common words
            })

# Convert the results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Save the results to a CSV file
output_path = "oeb_all_versions_matches_debugged.csv"
results_df.to_csv(output_path, index=False)

results_df
print(f"\nMatching titles across all 4 versions saved to {output_path}.")



DRB Titles (70):
['ecclesiastes', 'ecclesiasticus', 'solomon’s canticle of canticles', 'the acts of the apostles', 'the apocalypse of st. john the apostle', 'the book of deuteronomy', 'the book of esther', 'the book of exodus', 'the book of genesis', 'the book of job', 'the book of josue', 'the book of judges', 'the book of judith', 'the book of leviticus', 'the book of nehemias, which is called the second of esdras', 'the book of numbers', 'the book of proverbs', 'the book of psalms', 'the book of ruth', 'the book of tobias', 'the book of wisdom', 'the catholic epistle of st. james the apostle', 'the epistle of st. paul the apostle to the romans', 'the epistle of st. paul to philemon', 'the epistle of st. paul to the colossians', 'the epistle of st. paul to the ephesians', 'the epistle of st. paul to the galatians', 'the epistle of st. paul to the hebrews', 'the epistle of st. paul to the philippians', 'the epistle of st. paul to titus', 'the first book of esdras', 'the first book of

In [29]:
import pandas as pd
import re

# File paths for the input TSV files
file_paths = {
    "DRB": "DRB_normalized_titles.tsv",
    "KJV": "KJV_combined.tsv",
    "OEB": "OEB_combined.tsv",
    "WEB": "WEB_combined.tsv"
}

# Function to normalize a title (convert ordinals and clean)
def normalize_title(title):
    # Replace numbers with ordinals
    title = re.sub(r'\b1\b', 'first', title, flags=re.IGNORECASE)
    title = re.sub(r'\b2\b', 'second', title, flags=re.IGNORECASE)
    title = re.sub(r'\b3\b', 'third', title, flags=re.IGNORECASE)
    # Clean title (standardize phrases and lowercase)
    title = re.sub(r'\bthe\b', '', title, flags=re.IGNORECASE)  # Remove "the"
    title = re.sub(r'\bepistle of\b', 'epistle', title, flags=re.IGNORECASE)
    title = title.lower().strip()
    return title

# Load and normalize titles for all versions
aligned_titles_dict = {}
for version, path in file_paths.items():
    try:
        # Load the file
        df = pd.read_csv(path, sep="\t")
        # Extract unique titles and normalize them
        unique_titles = df["Book"].unique()
        normalized_titles = [normalize_title(title) for title in unique_titles]
        aligned_titles_dict[version] = normalized_titles
    except Exception as e:
        print(f"Error processing {version}: {e}")
        aligned_titles_dict[version] = []

# Create a mapping of aligned titles
aligned_mapping = []
for oeb_title in set(aligned_titles_dict["OEB"]):  # Use OEB as the base
    oeb_tokens = set(oeb_title.split())  # Tokenize the OEB title
    row = {"OEB": oeb_title}
    for version, titles in aligned_titles_dict.items():
        if version == "OEB":
            continue
        # Find matching titles that contain all OEB tokens
        matches = [
            title for title in titles
            if oeb_tokens.issubset(set(title.split()))  # Check if all OEB tokens exist
        ]
        row[version] = matches[0] if matches else None  # Take the first match or None
    aligned_mapping.append(row)

# Convert the mapping into a DataFrame
aligned_df = pd.DataFrame(aligned_mapping)

# Save the aligned DataFrame to a CSV file for review
aligned_df.to_csv("aligned_book_titles.csv", index=False)

# Print a sample of the DataFrame for verification
print("Aligned Book Titles Mapping:")
print(aligned_df.head())


Aligned Book Titles Mapping:
                    OEB                                        DRB  \
0            colossians            epistle st. paul to  colossians   
1                  amos                           prophecy of amos   
2                psalms                             book of psalms   
3           first peter           first epistle st. peter  apostle   
4  second thessalonians  second epistle st. paul to  thessalonians   

                                              KJV                   WEB  
0            epistle paul  apostle to  colossians            colossians  
1                                            None                  amos  
2                                  book of psalms                psalms  
3                  first epistle general of peter           first peter  
4  second epistle paul  apostle to  thessalonians  second thessalonians  


## Align all titles

In [31]:
import pandas as pd
import re

# File paths for the input TSV files
file_paths = {
    "DRB": "DRB_normalized_titles.tsv",
    "KJV": "KJV_combined.tsv",
    "OEB": "OEB_combined.tsv",
    "WEB": "WEB_combined.tsv"
}

# Function to normalize a title (convert ordinals and clean)
def normalize_title(title):
    # Replace numbers with ordinals
    title = re.sub(r'\b1\b', 'first', title, flags=re.IGNORECASE)
    title = re.sub(r'\b2\b', 'second', title, flags=re.IGNORECASE)
    title = re.sub(r'\b3\b', 'third', title, flags=re.IGNORECASE)
    # Clean title (standardize phrases and lowercase)
    title = re.sub(r'\bthe\b', '', title, flags=re.IGNORECASE)  # Remove "the"
    title = re.sub(r'\bepistle of\b', 'epistle', title, flags=re.IGNORECASE)
    title = title.lower().strip()
    return title

# Load and normalize titles for all versions
normalized_titles_dict = {}
for version, path in file_paths.items():
    try:
        # Load the file
        df = pd.read_csv(path, sep="\t")
        # Extract unique titles and normalize them
        unique_titles = df["Book"].unique()
        normalized_titles = [normalize_title(title) for title in unique_titles]
        normalized_titles_dict[version] = normalized_titles
    except Exception as e:
        print(f"Error processing {version}: {e}")
        normalized_titles_dict[version] = []

# Combine all titles with OEB as the baseline
all_titles = set(normalized_titles_dict["OEB"])  # Start with OEB titles
for version, titles in normalized_titles_dict.items():
    if version != "OEB":
        all_titles.update(titles)  # Add unique titles from other versions

# Create a comprehensive alignment table
aligned_mapping = []
for title in all_titles:  # Loop through every unique title
    row = {"OEB": "N/A", "DRB": "N/A", "KJV": "N/A", "WEB": "N/A"}  # Initialize with N/A
    for version, titles in normalized_titles_dict.items():
        # Check if the title exists in this version
        matches = [t for t in titles if set(title.split()) <= set(t.split())]
        row[version] = matches[0] if matches else "N/A"  # Add the matched title or N/A
    aligned_mapping.append(row)

# Convert the mapping into a DataFrame
aligned_df = pd.DataFrame(aligned_mapping)

# Add a helper column to explicitly handle N/A sorting
aligned_df["OEB_Sort_Key"] = aligned_df["OEB"].apply(lambda x: "zzz" if x == "N/A" else x)

# Sort the DataFrame alphabetically by OEB, placing N/A values last
aligned_df = aligned_df.sort_values(by="OEB_Sort_Key").drop(columns=["OEB_Sort_Key"])

# Save the aligned DataFrame to a CSV file
aligned_df.to_csv("aligned_book_titles_with_na_sorted.csv", index=False)

# Print a sample of the sorted DataFrame for verification
print("Comprehensive Aligned Book Titles with N/A (Sorted):")
print(aligned_df.head())

Comprehensive Aligned Book Titles with N/A (Sorted):
            OEB                              DRB  \
82         acts                acts of  apostles   
42         amos                 prophecy of amos   
75   colossians  epistle st. paul to  colossians   
135   ephesians   epistle st. paul to  ephesians   
70       esther                   book of esther   

                                      KJV         WEB  
82                                    N/A        acts  
42                                    N/A        amos  
75   epistle paul  apostle to  colossians  colossians  
135   epistle paul  apostle to  ephesians   ephesians  
70                         book of esther      esther  


## add index to kept book titles

In [32]:
import pandas as pd

# File path for the input file
input_file = "aligned_book_titles.csv"
output_file = "aligned_book_titles_with_index.csv"

# Load the file
df = pd.read_csv(input_file)

# Add an index column starting from 1
df.index = range(1, len(df) + 1)
df.index.name = "Index"  # Name the index column (optional)

# Save the updated DataFrame
df.to_csv(output_file, index=True)

print(f"File with index added saved to {output_file}.")


File with index added saved to aligned_book_titles_with_index.csv.


## filter all versions based on common book titles separately

In [34]:
import pandas as pd
import re

# File paths
aligned_file = "aligned_book_titles_with_index.csv"
file_paths = {
    "OEB": "OEB_combined.tsv",
    "DRB": "DRB_normalized_titles.tsv",
    "KJV": "KJV_combined.tsv",
    "WEB": "WEB_combined.tsv"
}

def normalize_title(title):
    if not isinstance(title, str):
        return title  # Return the title as-is if it's not a string (e.g., NaN)
    title = re.sub(r'\b1\b', 'first', title, flags=re.IGNORECASE)
    title = re.sub(r'\b2\b', 'second', title, flags=re.IGNORECASE)
    title = re.sub(r'\b3\b', 'third', title, flags=re.IGNORECASE)
    title = re.sub(r'\bthe\b', '', title, flags=re.IGNORECASE)
    title = re.sub(r'\bepistle of\b', 'epistle', title, flags=re.IGNORECASE)
    return title.lower().strip()

# Step 1: Load the aligned titles file
aligned_df = pd.read_csv(aligned_file)

# Step 2: Create a mapping of normalized titles to actual titles for each version
title_mappings = {}
for version, path in file_paths.items():
    try:
        # Load the TSV file
        df = pd.read_csv(path, sep="\t")
        # Normalize the titles and create a mapping
        df["Normalized Book"] = df["Book"].apply(normalize_title)
        title_mappings[version] = dict(zip(df["Normalized Book"], df["Book"]))
    except Exception as e:
        print(f"Error processing {version}: {e}")
        title_mappings[version] = {}

# Step 3: Filter each version based on the aligned titles
for version, path in file_paths.items():
    try:
        # Load the respective TSV file
        version_df = pd.read_csv(path, sep="\t")
        
        # Prepare the filtered DataFrame
        filtered_rows = []

        # Iterate over each row in the alignment file
        for _, row in aligned_df.iterrows():
            index = row["Index"]  # Get the index
            aligned_title = row[version]  # Get the aligned title for this version
            
            # Find the corresponding actual title from the mapping
            actual_title = title_mappings[version].get(normalize_title(aligned_title), None)
            if actual_title:
                # Filter rows matching the actual title
                matched_rows = version_df[version_df["Book"] == actual_title].copy()
                matched_rows["Index"] = index  # Add the index to the matched rows
                filtered_rows.append(matched_rows)

        # Combine all filtered rows
        if filtered_rows:
            filtered_version_df = pd.concat(filtered_rows, ignore_index=True)
        else:
            filtered_version_df = pd.DataFrame(columns=version_df.columns.tolist() + ["Index"])

        # Save the filtered DataFrame
        output_file = f"{version}_filtered_with_index.tsv"
        filtered_version_df.to_csv(output_file, sep="\t", index=False)
        print(f"Filtered data for {version} saved to {output_file}. Rows: {len(filtered_version_df)}")
    except Exception as e:
        print(f"Error processing {version}: {e}")


Filtered data for OEB saved to OEB_filtered_with_index.tsv. Rows: 11722
Filtered data for DRB saved to DRB_filtered_with_index.tsv. Rows: 9712
Filtered data for KJV saved to KJV_filtered_with_index.tsv. Rows: 7165
Filtered data for WEB saved to WEB_filtered_with_index.tsv. Rows: 11721


## align everything

In [35]:
import pandas as pd

# File paths for the filtered TSV files
file_paths = {
    "DRB": "DRB_filtered_with_index.tsv",
    "KJV": "KJV_filtered_with_index.tsv",
    "OEB": "OEB_filtered_with_index.tsv",
    "WEB": "WEB_filtered_with_index.tsv"
}

# List to hold DataFrames for each version
dataframes = []

# Add a "Version" column to each TSV file and load it
for version, path in file_paths.items():
    try:
        df = pd.read_csv(path, sep="\t")
        df["Version"] = version  # Add version column
        dataframes.append(df)
    except Exception as e:
        print(f"Error processing {version}: {e}")

# Combine all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Ensure Chapter and Verse columns are integers for proper sorting
combined_df["Chapter"] = combined_df["Chapter"].astype(int)
combined_df["Verse"] = combined_df["Verse"].astype(int)

# Sort by Book, then by Version, Chapter, and Verse
combined_df = combined_df.sort_values(by=["Book", "Version", "Chapter", "Verse"])

# Reorder columns for readability
column_order = ["Index", "Version", "Book", "Chapter", "Verse", "Text"]
combined_df = combined_df[column_order]

# Save to a new TSV file
output_file = "aligned_bible_data.tsv"
combined_df.to_csv(output_file, sep="\t", index=False)

print(f"Combined file grouped by Book saved to: {output_file}")



Combined file grouped by Book saved to: aligned_bible_data.tsv


## sorted alignment

In [36]:
import pandas as pd

# Load the combined data
combined_file = "aligned_bible_data.tsv"
df = pd.read_csv(combined_file, sep="\t")

# Ensure Chapter and Verse are treated as integers
df["Chapter"] = df["Chapter"].astype(int, errors="ignore")
df["Verse"] = df["Verse"].astype(int, errors="ignore")

# Sort the DataFrame
df = df.sort_values(
    by=["Index", "Version", "Book", "Chapter", "Verse"],  # Sorting keys
)

# Save to a new file
output_file = "sorted_aligned_bible_data.tsv"
df.to_csv(output_file, sep="\t", index=False)

print(f"Sorted combined data saved to: {output_file}")



Sorted combined data saved to: sorted_aligned_bible_data.tsv


In [37]:
import pandas as pd
import re

# File paths
input_file = "DRB_preprocessed_columns.tsv"
output_file = "DRB_normalized_titles.tsv"

# List of canonical book titles
canonical_titles = DRB_old_test_books + DRB_new_test_books

# Function to normalize a title
def normalize_title(title):
    # Convert title to lowercase and strip whitespace
    title = title.strip().lower()
    # Match with canonical titles (case insensitive)
    for canonical in canonical_titles:
        if re.fullmatch(canonical.strip().lower(), title):
            return canonical  # Return the canonical title if it matches
    return title  # Return original title if no match is found

# Read the TSV file
df = pd.read_csv(input_file, sep="\t")

# Normalize the "Book" column
df["Book"] = df["Book"].apply(normalize_title)

# Save the updated DataFrame to a new TSV file
df.to_csv(output_file, sep="\t", index=False)

print(f"Book titles in '{input_file}' have been normalized and saved to '{output_file}'.")

Book titles in 'DRB_preprocessed_columns.tsv' have been normalized and saved to 'DRB_normalized_titles.tsv'.
