In [40]:
import re
import pandas as pd
import numpy as np

# Read the raw index file; each line is expected to hold "<reference> - <verse text>".
with open(r"nkjv_index.txt", 'r', encoding='utf8') as f:
    lines = f.readlines()

# file = open(r"complete_bible_nnrv.txt", 'w+', encoding='utf8')
# Split on the FIRST " - " only, so a verse that itself contains " - " still yields
# exactly two fields, and drop the line terminator that readlines() keeps on each line.
complete_bible_ix = [line.rstrip("\r\n").split(" - ", 1) for line in lines]

resulting_df = pd.DataFrame(complete_bible_ix, columns=['book_chapter_verse', 'verse_text'])

In [41]:
# Preview the first five parsed rows (reference and raw verse text).
resulting_df.head(5)

Unnamed: 0,book_chapter_verse,verse_text
0,Gen1:1,<The History of Creation>In the beginning God ...
1,Gen1:2,"The earth was without form, and void; and dark..."
2,Gen1:3,"Then God said, “Let there be light”; and there..."
3,Gen1:4,"And God saw the light, that <i>it was</i> good..."
4,Gen1:5,"God called the light Day, and the darkness He ..."


In [42]:
# Remove single-character opening tags such as <i>. The pattern is a raw string so that
# \w reaches the regex engine instead of being parsed as an invalid string escape, and
# the vectorised .str.replace leaves missing values as NaN rather than raising.
resulting_df['cleaned_verse_text'] = resulting_df['verse_text'].str.replace(r"<\w>", "", regex=True)

In [44]:
# Remove single-character closing tags such as </i>. The pattern is a raw string so that
# \w reaches the regex engine instead of being parsed as an invalid string escape.
resulting_df['cleaned_verse_text'] = resulting_df['cleaned_verse_text'].str.replace(r"</\w>", "", regex=True)

In [48]:
# Remove the remaining <...> spans (section headings such as <The History of Creation>).
# [^>]+ stops at the first closing bracket, so each bracketed span is removed on its own;
# a greedy match would also delete any verse text sitting between two bracketed spans.
resulting_df['cleaned_verse_text_heading'] = resulting_df['cleaned_verse_text'].str.replace(r"<[^>]+>", "", regex=True)

In [49]:
# Preview the first rows, now including the two cleaned text columns.
resulting_df.head()

Unnamed: 0,book_chapter_verse,verse_text,cleaned_verse_text,cleaned_verse_text_heading
0,Gen1:1,<The History of Creation>In the beginning God ...,<The History of Creation>In the beginning God ...,In the beginning God created the heavens and t...
1,Gen1:2,"The earth was without form, and void; and dark...","The earth was without form, and void; and dark...","The earth was without form, and void; and dark..."
2,Gen1:3,"Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there..."
3,Gen1:4,"And God saw the light, that <i>it was</i> good...","And God saw the light, that it was good; and G...","And God saw the light, that it was good; and G..."
4,Gen1:5,"God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ..."


In [50]:
# Work on a copy so the columns added to `df` below do not modify `resulting_df`.
df = resulting_df.copy()

In [51]:
# A reference looks like "Gen1:1": the part after the last colon is the verse,
# the part before the first colon is the combined book/chapter label.
df['verse'] = df['book_chapter_verse'].apply(lambda ref: ref.rpartition(':')[-1])
df['book_chapter'] = df['book_chapter_verse'].apply(lambda ref: ref.partition(':')[0])

In [52]:
# Preview the first rows with the new `verse` and `book_chapter` columns.
df.head()

Unnamed: 0,book_chapter_verse,verse_text,cleaned_verse_text,cleaned_verse_text_heading,verse,book_chapter
0,Gen1:1,<The History of Creation>In the beginning God ...,<The History of Creation>In the beginning God ...,In the beginning God created the heavens and t...,1,Gen1
1,Gen1:2,"The earth was without form, and void; and dark...","The earth was without form, and void; and dark...","The earth was without form, and void; and dark...",2,Gen1
2,Gen1:3,"Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...",3,Gen1
3,Gen1:4,"And God saw the light, that <i>it was</i> good...","And God saw the light, that it was good; and G...","And God saw the light, that it was good; and G...",4,Gen1
4,Gen1:5,"God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...",5,Gen1


In [53]:
def chapter_extractor(row):
    """Split a combined book/chapter label into a ``(book, chapter)`` tuple.

    For example ``"Gen1"`` -> ``("Gen", "1")`` and ``"Psa119"`` -> ``("Psa", "119")``.
    A leading digit in the book part is kept, so ``"1Jn3"`` -> ``("1Jn", "3")``.

    Parameters
    ----------
    row : str
        Label made of a book abbreviation immediately followed by a chapter
        number of one to three ASCII digits.

    Returns
    -------
    tuple
        ``(book, chapter)``, both as strings. ``(np.nan, np.nan)`` is returned
        when ``row`` is not a string, has no trailing chapter number, has no
        book part, or has a chapter number longer than three digits.
    """
    if not isinstance(row, str):
        return (np.nan, np.nan)

    # The book part must end in a non-digit, which pins the chapter to the final
    # run of digits; {1,3} keeps the three-digit limit of the chapter number.
    match = re.fullmatch(r"(.*[^0-9])([0-9]{1,3})", row, flags=re.DOTALL)
    if match is None:
        return (np.nan, np.nan)
    return (match.group(1), match.group(2))

In [54]:
# Derive the (book, chapter) pairs from the untouched reference column rather than from
# `book_chapter`, which this cell overwrites with tuples. Reading the overwritten column
# would feed tuples back into chapter_extractor if the cell were run a second time.
book_chapter_pairs = df['book_chapter_verse'].apply(lambda ref: chapter_extractor(ref.split(':')[0]))
df['book_chapter'] = book_chapter_pairs
df['book'] = book_chapter_pairs.apply(lambda pair: pair[0])
df['chapter'] = book_chapter_pairs.apply(lambda pair: pair[1])

In [55]:
# Preview the first five rows with the new `book` and `chapter` columns.
df.head(5)

Unnamed: 0,book_chapter_verse,verse_text,cleaned_verse_text,cleaned_verse_text_heading,verse,book_chapter,book,chapter
0,Gen1:1,<The History of Creation>In the beginning God ...,<The History of Creation>In the beginning God ...,In the beginning God created the heavens and t...,1,"(Gen, 1)",Gen,1
1,Gen1:2,"The earth was without form, and void; and dark...","The earth was without form, and void; and dark...","The earth was without form, and void; and dark...",2,"(Gen, 1)",Gen,1
2,Gen1:3,"Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...",3,"(Gen, 1)",Gen,1
3,Gen1:4,"And God saw the light, that <i>it was</i> good...","And God saw the light, that it was good; and G...","And God saw the light, that it was good; and G...",4,"(Gen, 1)",Gen,1
4,Gen1:5,"God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...",5,"(Gen, 1)",Gen,1


In [56]:
# Load the book reference table (book id, testament id, full name, abbreviation) from CSV.
bible_abbr_df = pd.read_csv("bible_abbreviations_final.csv")

In [57]:
# Preview the first four rows of the reference table.
bible_abbr_df.head(4)

Unnamed: 0,book_id,testament_id,book_name,book_abbreviation
0,1,1,Genesis,Gen
1,2,1,Exodus,Ex
2,3,1,Leviticus,Lev
3,4,1,Numbers,Num


In [58]:
# Distinct values of each column, in the order pandas' unique() returns them.
abbr_books_list = list(bible_abbr_df['book_abbreviation'].unique())
full_books_list = list(bible_abbr_df['book_name'].unique())
db_books_list = list(df['book'].unique())

# The three lists are paired purely by position, so they must at least have the same
# length. Fail early with the actual counts instead of pandas' generic length error.
# NOTE(review): equal lengths do not prove that both sources list the books in the
# same order; confirm that against the displayed mapping.
if not (len(abbr_books_list) == len(db_books_list) == len(full_books_list)):
    raise ValueError(
        "Cannot pair book lists by position: "
        f"{len(abbr_books_list)} abbreviations, "
        f"{len(db_books_list)} books in df, "
        f"{len(full_books_list)} full book names"
    )

books_abbr_mapping = pd.DataFrame({
    "abbr_books_format": abbr_books_list,
    "abbr_df_format": db_books_list,
    "full_book_name": full_books_list
})

In [59]:
# Number the books 1..N in row order and expose that number as an `index` column.
# N comes from the frame itself rather than a hard-coded 66, and any `index` column
# left by a previous run is dropped first so the cell can be re-executed safely.
book_numbers = pd.Index(list(range(1, len(books_abbr_mapping) + 1)))
books_abbr_mapping = (
    books_abbr_mapping
    .drop(columns="index", errors="ignore")
    .set_index(book_numbers)
    .reset_index()
)

In [60]:
# Preview the first five rows of the abbreviation mapping.
books_abbr_mapping.head(5)

Unnamed: 0,index,abbr_books_format,abbr_df_format,full_book_name
0,1,Gen,Gen,Genesis
1,2,Ex,Exo,Exodus
2,3,Lev,Lev,Leviticus
3,4,Num,Num,Numbers
4,5,Deut,Deu,Deuteronomy


In [61]:
# Attach the full book name and book number to every verse by matching the
# verse's `book` abbreviation against the mapping's `abbr_df_format` column.
df = df.merge(
    right=books_abbr_mapping[['abbr_df_format', 'full_book_name', 'index']],
    how="inner",
    left_on="book",
    right_on="abbr_df_format",
)

In [62]:
# Preview the first rows after merging in the full book name and book number.
df.head()

Unnamed: 0,book_chapter_verse,verse_text,cleaned_verse_text,cleaned_verse_text_heading,verse,book_chapter,book,chapter,abbr_df_format,full_book_name,index
0,Gen1:1,<The History of Creation>In the beginning God ...,<The History of Creation>In the beginning God ...,In the beginning God created the heavens and t...,1,"(Gen, 1)",Gen,1,Gen,Genesis,1
1,Gen1:2,"The earth was without form, and void; and dark...","The earth was without form, and void; and dark...","The earth was without form, and void; and dark...",2,"(Gen, 1)",Gen,1,Gen,Genesis,1
2,Gen1:3,"Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...","Then God said, “Let there be light”; and there...",3,"(Gen, 1)",Gen,1,Gen,Genesis,1
3,Gen1:4,"And God saw the light, that <i>it was</i> good...","And God saw the light, that it was good; and G...","And God saw the light, that it was good; and G...",4,"(Gen, 1)",Gen,1,Gen,Genesis,1
4,Gen1:5,"God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...","God called the light Day, and the darkness He ...",5,"(Gen, 1)",Gen,1,Gen,Genesis,1


In [63]:
# Keep only the columns selected for the final table:
# book number, chapter, verse and the fully cleaned verse text.
final_bible_df = df.loc[
    :,
    ['index', 'chapter', 'verse', 'cleaned_verse_text_heading'],
]

In [64]:
# # save as csv
# final_bible_df.to_csv("bible_cleaned_formatted_nkjv.csv", index=False, header=False)