# Abstract

### Import Files

In [110]:
import requests
import pandas as pd
import numpy as np
import json
import re
import string
import matplotlib.pyplot as plt
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from keras import models
from keras.layers import Dense, Activation
from gensim.models import Word2Vec

from API import My_API

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Accessing APIs for Bible text

To store the different versions of the Bible, we will use a dictionary as the main structure to hold the content of each, called `versions`

In [2]:
versions = {}

In this section, the most care had to be taken into consideration when pulling information from each API.  In an attempt to access the text as easily as possible, I am using a custom API to accesss each site. The queries are inserted and built by hand using the respective documentation at each site.  

#### Get Cannonical books

The API _Biblia API_ is a very useful site that is well documented and also has services built in to get the contents fo each specific bible.  To make the text retrieval process more seamless, the list of books within standard canonical bibles are retrieved, then stored for later use within different queries.

In [3]:
## Pull the canonical books from Website
my_api = My_API(url="https://api.biblia.com/v1/bible/", key='fd37d8f28e95d3be8cb4fbc37e15e18e')
query = 'contents/KJV?'
resp = my_api.run_query(query)

## Strip out the books and remvoe whitespace with '_' 
canonical_books = [book['passage'].replace(" ","_").strip() for book in resp['books']]

In [4]:
canonical_books

['Genesis',
 'Exodus',
 'Leviticus',
 'Numbers',
 'Deuteronomy',
 'Joshua',
 'Judges',
 'Ruth',
 '1_Samuel',
 '2_Samuel',
 '1_Kings',
 '2_Kings',
 '1_Chronicles',
 '2_Chronicles',
 'Ezra',
 'Nehemiah',
 'Esther',
 'Job',
 'Psalms',
 'Proverbs',
 'Ecclesiastes',
 'Song_of_Solomon',
 'Isaiah',
 'Jeremiah',
 'Lamentations',
 'Ezekiel',
 'Daniel',
 'Hosea',
 'Joel',
 'Amos',
 'Obadiah',
 'Jonah',
 'Micah',
 'Nahum',
 'Habakkuk',
 'Zephaniah',
 'Haggai',
 'Zechariah',
 'Malachi',
 'Matthew',
 'Mark',
 'Luke',
 'John',
 'Acts',
 'Romans',
 '1_Corinthians',
 '2_Corinthians',
 'Galatians',
 'Ephesians',
 'Philippians',
 'Colossians',
 '1_Thessalonians',
 '2_Thessalonians',
 '1_Timothy',
 '2_Timothy',
 'Titus',
 'Philemon',
 'Hebrews',
 'James',
 '1_Peter',
 '2_Peter',
 '1_John',
 '2_John',
 '3_John',
 'Jude',
 'Revelation']

### Custom functions for Bible text retrieval and formatting

Text retrieval was the most difficult part of the project. Each API differs significantly and the format of the text returned made this even more difficult.

In [27]:
## Custom functions

def remove_nonalnum_lead_trail(s):
    try:
        if not s.isalnum():
            if not s[-1].isalnum():
                s = s.strip(s[-1])
            if not s[0].isalnum():
                s = s.strip(s[0])
        return s
    except:
        print(f"error has occurred.  Word: {s}")

def remove_punct(list_of_words=None):
    """
        Function to remove punctuations characters from list of words passed in.
    """
    updated_list_of_words = []
    
    additional_chars = list('､—’')
    
    punctuation_list = list(string.punctuation)
    punctuation_list += additional_chars
    
#     print(punctuation_list)
    
    if list_of_words:
        updated_list_of_words = [item for item in list_of_words if item not in punctuation_list]
    return updated_list_of_words

def remove_stop_words(list_of_words=None):
    pass
    
def parse_book(book_dict=None, tokenize_verses=False):
    b_name = book_dict['book_name']
    chapters = book_dict['book'].keys() #get chapter numbers
    parsed_book = {}
    
    #Loop through chapter numbers and get verse content
    for c_num in chapters:        
        chapter = book_dict['book'][c_num]['chapter']
        verses = list(chapter.keys())
        
        #Loop through verses & parse content
        for v_num in verses:
            verse = chapter[v_num]['verse'].strip()
            if tokenized_verses:
                tokenized_verse = remove_punct(word_tokenize(verse))
                parsed_book[f'{b_name} {c_num}:{v_num}'] = tokenized_verse            
            else:
                parsed_book[f'{b_name} {c_num}:{v_num}'] = verse
    return parsed_book

def parse_formatted_verse_ref(text='', tokenize_verses=False):
    """
        This method takes in fully formatted bible verse text and stores it in a dictionary.
        Passing text that is too large into this function may cause issues within Jupyter notebook.  Be advised
            
        Argument:
            
            text {str}:
                Formatted Bible text sea
        
        Return:
            parsed_dict {dict}:
                Dictionary containing parsed verses with the information in the following format
                    
                    key {str}: {Book}{chapter}:{verse}
                    value {ist}: tokenized bible verse text

    """
    search = re.findall(r'^(.*:\d+)(.*)', text.strip(), re.M|re.I)
    if tokenize_verses:
        search_dict = {item[0]: remove_punct(word_tokenize(item[1].strip())) for item in search}
    else:
        search_dict = {item[0]: item[1].strip() for item in search}
    return search_dict
    

### Bible.Org API

This site only has one available version. Pulling from this one should not take too long to accomplish

|Available Bible Version | Version ID |
| :---:    |  :----:   |
|New English Translation | _NET_|

In [8]:
bible_org_available_version = ['NET']

### _Biblia API_

This site is very useful and hosts a strong amount of services for pulling Bible text from the website.

Available Bible Version |	Version ID
:---:     |:----:
American Standard Version |	ASV
Authorized Version |	KJV
1890 Darby Bible |	DARBY
The Emphasized Bible |	EMPHBBL
King James Version |	KJV1900
The King James Version Apocrypha |	KJVAPOC
The Lexham English Bible |	LEB
Young’s Literal Translation |	YLT


In [38]:
biblia_available_versions = ['KJV1900','LEB','EMPHBBL','DARBY']

To get the content from a specific bible, the `content` service of the API has to be used.  Selecting different services is pretty simple.  To do so, you enter the name of the service as a path, within a directory as seen below.  

Pull the text for each book from website and store in dictionary for later use

In [39]:
my_api = My_API(url="https://api.biblia.com/v1/bible/", key='fd37d8f28e95d3be8cb4fbc37e15e18e')

biblia_versions = {}

for v in biblia_available_versions:
    print(v)
    ver = {}
    for book in canonical_books:

        query = f'content/{v}.txt?passage={book}&style=oneVersePerLineFullReference'
        resp = my_api.run_query(query)

        book_dict = parse_formatted_verse_ref(resp)
        ver.update(book_dict)
        
    versions[v] = ver

KJV1900
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
--------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
EMPHBBL
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
--------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

In [40]:
versions.keys()

dict_keys(['KJV', 'AKJV', 'ASV', 'BASICENGLISH', 'DOUAYRHEIMS', 'WB', 'WEB', 'YLT', 'KJV1900', 'LEB', 'EMPHBBL', 'DARBY'])

### GetBible.net

Below are the English BIble Versions available at this website:
English 

In [10]:
getbible_eng_versions = ['KJV','AKJV','ASV','BASICENGLISH','DOUAYRHEIMS', 'WB','WEB','YLT']

In [11]:
my_api = My_API(url="https://getbible.net/")

for v in getbible_eng_versions:
    v_dict = {}
    print(v)
    for book in canonical_books:
        resp = my_api.run_query(f'json?passage={book}&v={v.lower()}')
        resp_dict = json.loads(resp[1:-2])
        v_dict.update(parse_book(resp_dict))
    versions[v] = v_dict

KJV
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
ASV
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
---------------------------------------------------------------
Error converting response to JSON format. Returning as a string
---------------------------------------------------------------
----------------------------------------

In [12]:
versions.keys()

dict_keys(['KJV', 'AKJV', 'ASV', 'BASICENGLISH', 'DOUAYRHEIMS', 'WB', 'WEB', 'YLT'])

In [17]:
versions['KJV']['Genesis 1:1'].strip()

'In the beginning God created the heaven and the earth.'

# Save data to file

Here, we are saving the dictionary as a json object to a file, for easier access later on

In [41]:
with open("./data/bibles_formatted.json", "w") as f:
    js = json.dump(versions, fp=f, indent=4, separators=(',', ': '))

### Pull data from saved file

In [42]:
with open("./data/bibles_formatted.json") as f:
    data = json.load(f)

In [43]:
data.keys()

dict_keys(['KJV', 'AKJV', 'ASV', 'BASICENGLISH', 'DOUAYRHEIMS', 'WB', 'WEB', 'YLT', 'KJV1900', 'LEB', 'EMPHBBL', 'DARBY'])

In [46]:
data['YLT']

{'Genesis 1:1': "In the beginning of God's preparing the heavens and the  earth --\r\n",
 'Genesis 1:2': 'the earth hath existed waste and void, and darkness [is] on  the face of the deep, and the Spirit of God fluttering on the  face of the waters,\r\n',
 'Genesis 1:3': "and God saith, `Let light be;' and light is.\r\n",
 'Genesis 1:4': 'And God seeth the light that [it is] good, and God  separateth between the light and the darkness,\r\n',
 'Genesis 1:5': "and God calleth to the light `Day,' and to the darkness He  hath called `Night;' and there is an evening, and there is a  morning -- day one.\r\n",
 'Genesis 1:6': "And God saith, `Let an expanse be in the midst of the  waters, and let it be separating between waters and waters.'\r\n",
 'Genesis 1:7': 'And God maketh the expanse, and it separateth between the  waters which [are] under the expanse, and the waters which  [are] above the expanse: and it is so.\r\n',
 'Genesis 1:8': "And God calleth to the expanse `Heavens;' and there 

# General Statistics

Here, we will make a dataframe out of the parsed text to get a better idea of how the information is formatted

In [142]:
list_of_df = []
for key in data.keys():
    df = pd.DataFrame.from_dict(data[key], orient='index', columns=['text'])
    df['version'] = key
    list_of_df.append(df)
    
df = pd.concat(list_of_df)

In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 372496 entries, Genesis 1:1 to Revelation 22:21
Data columns (total 2 columns):
text       372496 non-null object
version    372496 non-null object
dtypes: object(2)
memory usage: 8.5+ MB


In [146]:
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df['title_word_count'] = df['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [147]:
gen_series = (df.filter(regex='Genesis', axis=0))

In [148]:
gen_series.mean().plot()

KeyboardInterrupt: 

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31102 entries, Genesis 1:1 to Revelation 22:21
Data columns (total 7 columns):
text                     31102 non-null object
char_count               31102 non-null int64
word_count               31102 non-null int64
word_density             31102 non-null float64
punctuation_count        31102 non-null int64
title_word_count         31102 non-null int64
upper_case_word_count    31102 non-null int64
dtypes: float64(1), int64(5), object(1)
memory usage: 1.9+ MB


# Text Formatting

Now that the bible version text has been stored, t

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatizer.lemmatize(data['EMPHBBL'].values())

In [None]:
# nltk.download('stopwords')
stopwords_list = stopwords.words('english')

In [None]:
stopwords_list += string.punctuation
stopwords_list

# Modeling

### TF-IDF Vectorization

In [None]:
## These are functions taken from Flat Iron Curriculum

def count_vectorize(line, vocab=None):

    if vocab:  
        for word in line:
            if word in vocab:
                vocab[word] += 1
            else: 
                vocab[word] = 1
        return vocab
        
    else:
        unique_words = list(set(line))

        text_dict = {i:0 for i in unique_words}
        
        for word in line:
            if word in text_dict:
                text_dict[word] += 1
            else :
                text_dict[word] = 1    
        
        return text_dict

def term_frequency(BoW_dict):
    total_word_count = sum(BoW_dict.values())
    
    for ind, val in BoW_dict.items():
        BoW_dict[ind] = val/ total_word_count
    
    return BoW_dict


def inverse_document_frequency(list_of_dicts):
    vocab_set = set()
    # Iterate through list of dfs and add index to vocab_set
    for d in list_of_dicts:
        for word in d.keys():
            vocab_set.add(word)
    
    # Once vocab set is complete, create an empty dictionary with a key for each word and value of 0.
    full_vocab_dict = {i:0 for i in vocab_set}
    
    # Loop through each word in full_vocab_dict
    for word, val in full_vocab_dict.items():
        docs = 0
        
        # Loop through list of dicts.  Each time a dictionary contains the word, increment docs by 1
        for d in list_of_dicts:
            if word in d:
                docs += 1
        
        # Now that we know denominator for equation, compute and set IDF value for word
        
        full_vocab_dict[word] = np.log((len(list_of_dicts)/ float(docs)))
    
    return full_vocab_dict


def tf_idf(list_of_dicts):
    
    # Create empty dictionary containing full vocabulary of entire corpus
    doc_tf_idf = {}
    idf = inverse_document_frequency(list_of_dicts)
    full_vocab_list = {i:0 for i in list(idf.keys())}
    
    # Create tf-idf list of dictionaries, containing a dictionary that will be updated for each document
    tf_idf_list_of_dicts = []
    
    # Now, compute tf and then use this to compute and set tf-idf values for each document
    for doc in list_of_dicts:
        doc_tf = term_frequency(doc)
        for word in doc_tf:
            doc_tf_idf[word] = doc_tf[word] * idf[word]
        tf_idf_list_of_dicts.append(doc_tf_idf)
    
    return tf_idf_list_of_dicts

In [None]:
list_of_verses = list(data['ASV'].values())

In [None]:
vocab_count = {}
for verse in list_of_verses:
     vocab_count = count_vectorize(verse, vocab_count)
        
vocab_count

In [None]:
term_freq = term_frequency(vocab_count)

In [None]:
inverse_document_frequency([term_freq])
tf_idf([term_freq])

In [None]:
for key in data:
    list_of_verses = list(data[key].values())
    
    vocab_count = {}
    for verse in list_of_verses:
         vocab_count = count_vectorize(verse, vocab_count)
    term_freq = term_frequency(vocab_count)
    tf_idf()

### TF-idf vectorizer using sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [None]:
list_of_verses = [' '.join(val) for val in data['KJV'].values()]

In [None]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(list_of_verses)

In [None]:
word_count_vector.shape

__Compute the IDF values__

In [None]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [None]:
# print idf values
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf.sort_values(by=['idf_weights'])

__Compute the TFIDF score for the document__

In [None]:
# count matrix
count_vector=cv.transform(list_of_verses)
 
# tf-idf scores
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [None]:
feature_names = cv.get_feature_names()
 
#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]
 
#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

In [None]:
tf_idf_vector[:,:].T.todense()

### Word2Vector

#### Passing Tokenized text into Word2Vec Model

In [None]:
# versions['KJV'].values()

In [None]:
model = Word2Vec(data['EMPHBBL'].values(), size=100, window=5, min_count=1, workers=4)

In [None]:
model.train(data['EMPHBBL'].values(), total_examples=model.corpus_count, epochs=10)

In [None]:
wv = model.wv

In [None]:
wv.most_similar(positive=['Lord','man','woman'], negative=['death'], topn=20)

In [None]:
versions_wv_dict = {}

In [None]:
for key in data.keys():
    model = Word2Vec(data[key].values(), size=100, window=5, min_count=1, workers=4)
    model.train(data[key].values(), total_examples=model.corpus_count, epochs=10)
    versions_wv_dict[key] = model.wv

In [None]:
words_to_check = ['salvation','resurrection','healing', 'redemption','hope','joy','peace']

In [None]:
for key in versions_wv_dict.keys():
    print(f"{key}")
    similar_words = versions_wv_dict[key].most_similar(positive=['grace'], topn=10)
    for item in similar_words:
        print(item)
    print('\n\n')

In [None]:
for key in versions_wv_dict.keys():
    print(f"{key}")
    similar_words = versions_wv_dict[key].most_similar(negative=['hope'], topn=10)
    for item in similar_words:
        print(item)
    print('\n\n')

### GLoVe Modeling

In [None]:
tokenized_set = set(tokenized_text)
glove = {}
with open('../glove.6B/glove.6B.100d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in tokenized_set:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [None]:
print(string.punctuation)