<a href="https://colab.research.google.com/github/ryderwishart/biblical-machine-learning/blob/main/semantic_search_mvp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.corpora.dictionary import Dictionary
import os
import re

In [9]:
# Download the KJV text file
filename = 'pg10.txt'
if filename not in [path for path in os.listdir()]:
    !wget -q 'https://www.gutenberg.org/cache/epub/10/pg10.txt'
os.listdir()

['.config', 'pg10.txt', 'sample_data']

In [72]:
# stop_words = ['the', 'and', 'of', 'to', 'And', 'that', 'in', 'shall', 'he', 'unto', 'I', 'his', 'a', 'for', 'they', 'be', 'is', 'him', 'not', 'them', 'with', 'it', 'all', 'thou', 'was', 'thy', 'which', 'my', 'me', 'said', 'their', 'have', 'thee', 'will', 'ye', 'from', 'as', 'are', 'were', 'out', 'upon', 'you', 'by', 'when', 'this', 'but']

In [53]:
# Load the KJV text file and extract the sentences
sentences = []
with open(filename, 'r') as f:
    # open the whole text so the verses can be properly split
    text = f.read()
    paragraphs = text.split('\n\n') # blank line for paragraph
    for paragraph in paragraphs:
        # Extract paragraphs that begin with a verse reference
        if re.match(r'\d+:\d+', paragraph):
            # Clean the sentence by removing the verse reference number and punctuation
            sentence = paragraph.replace('\n', ' ').strip().split(' ')[1:]
            sentence = [word.strip('.,!?;:-') for word in sentence]
            sentences.append(sentence)

print('Sentences in corpus: ', len(sentences))

Sentences in corpus:  24337


In [73]:
# Count the frequency of each word in the corpus
word_counts = {}
for sentence in sentences:
    for word in sentence:
        if word not in word_counts:
            word_counts[word] = 1
        else:
            word_counts[word] += 1

stop_words = [word for word, count in sorted(word_counts.items(), key=lambda item: item[1], reverse=True)[:50]]
exceptions = ['LORD', 'Israel', 'man', 'God']
stop_words = [word for word in stop_words if word not in exceptions]
print(stop_words)

['the', 'and', 'of', 'to', 'And', 'that', 'in', 'shall', 'he', 'unto', 'I', 'his', 'a', 'for', 'they', 'be', 'is', 'him', 'not', 'them', 'with', 'it', 'all', 'thou', 'was', 'thy', 'which', 'my', 'me', 'said', 'their', 'have', 'thee', 'will', 'ye', 'from', 'as', 'are', 'were', 'out', 'upon', 'you', 'by', 'when', 'this', 'but']


In [74]:
# Build the vocabulary of words that appear in the corpus
dictionary = Dictionary(sentences)

In [88]:
# Train the Doc2Vec model on the sentences
documents = [TaggedDocument(
    words=[word for word in sentence if word not in stop_words], 
    tags=[str(i)]) for i, sentence in enumerate(sentences)
]
model = Doc2Vec(
    corpus=documents, 
    dictionary=dictionary,
    vector_size=100, 
    window=5, 
    min_count=5, 
    workers=4, 
    epochs=2
)
model.build_vocab(documents)

In [89]:
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [105]:
# Get the five most similar sentences to an input sentence
input_sentence = "Love your neighbor"
vector = model.infer_vector(input_sentence.split())
similar_sentences = model.docvecs.most_similar(positive=[vector], topn=10)

In [106]:
# Print the similar sentences
print('*NOTE: this model is trained on a tiny, toy corpus (the KJV). The results cannot be good, and this should serve as a proof of concept only.\n')
for index, similarity in similar_sentences:
    print("{:.2f}".format(similarity), ' '.join(sentences[int(index)]))

*NOTE: this model is trained on a tiny, toy corpus (the KJV). The results cannot be good, and this should serve as a proof of concept only.

0.83 Remember I beseech thee that thou hast made me as the clay and wilt thou bring me into dust again  10:10 Hast thou not poured me out as milk and curdled me like cheese  10:11 Thou hast clothed me with skin and flesh and hast fenced me with bones and sinews
0.83 And the people answered and said God forbid that we should forsake the LORD to serve other gods 24:17 For the LORD our God he it is that brought us up and our fathers out of the land of Egypt from the house of bondage and which did those great signs in our sight and preserved us in all the way wherein we went and among all the people through whom we passed 24:18 And the LORD drave out from before us all the people even the Amorites which dwelt in the land therefore will we also serve the LORD for he is our God
0.83 This is the inheritance of the tribe of the children of Naphtali accord