<a href="https://colab.research.google.com/github/ryderwishart/biblical-machine-learning/blob/main/tfidf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import dependencies

In [64]:
import os
import codecs
import time
try:
    from unidecode import unidecode
    from collections.abc import Mapping
    from gensim.models.tfidfmodel import TfidfModel
    from gensim.corpora import Dictionary
except:
#     print("Depencies not found. Make sure you have installed GenSim.")
    !pip install -I gensim unidecode
#     !pip install -Iv gensim==3.2.0
    from collections.abc import Mapping
    from unidecode import unidecode
    from gensim.models.tfidfmodel import TfidfModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Using cached gensim-4.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
Collecting unidecode
  Using cached Unidecode-1.3.6-py3-none-any.whl (235 kB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Collecting numpy>=1.18.5
  Using cached numpy-1.24.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Collecting scipy>=1.7.0
  Using cached scipy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Installing collected packages: unidecode, smart-open, numpy, scipy, gensim
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.24.2 which is incompatible.[0m[31m
[0mSuccessfully installed gens

In [45]:
if 'biblical-machine-learning' not in [path for path in os.listdir()]:
    !git clone https://github.com/ryderwishart/biblical-machine-learning.git

Cloning into 'biblical-machine-learning'...
remote: Enumerating objects: 1321, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 1321 (delta 11), reused 35 (delta 11), pack-reused 1284[K
Receiving objects: 100% (1321/1321), 63.78 MiB | 15.52 MiB/s, done.
Resolving deltas: 100% (40/40), done.
Updating files: 100% (1302/1302), done.


In [46]:
# Collect the entries of the repository's data folder, leaving out hidden
# entries (names starting with a dot).
corpus_directories = list(
    filter(
        lambda name: not name.startswith('.'),
        os.listdir('biblical-machine-learning/data'),
    )
)
print('Directories found in data folder:', corpus_directories)

Directories found in data folder: ['texts', 'lemmas']


## Locate corpus data

In [72]:
# Tokenizer switches (read by tokenize() below).
force_lowercase = True  # lowercase tokens; stopwords are only filtered when this is on
use_lemma_disambiguation = False # Some lemmas are indicated with a numeric suffix (e.g., 'ὅτι2')

# TODO: rewrite the corpus iterator without the codecs module.

# Stopword list: a comma-separated base list plus a few space-separated extras.
# Every entry is passed through unidecode() so that it can be compared against
# unidecode()-transliterated tokens.
perseus_stopwords = [
    unidecode(word)
    for word in (
        "μή, ἑαυτοῦ, ἄν, ἀλλ', ἀλλά, ἄλλος, ἀπό, ἄρα, αὐτός, δ', δέ, δή, διά, δαί, δαίς, ἔτι, ἐγώ, ἐκ, ἐμός, ἐν, ἐπί, εἰ, εἰμί, εἴμι, εἰς, γάρ, γε, γα, ἡ, ἤ, καί, κατά, μέν, μετά, μή, ὁ, ὅδε, ὅς, ὅστις, ὅτι, οὕτως, οὗτος, οὔτε, οὖν, οὐδείς, οἱ, οὐ, οὐδέ, οὐκ, περί, πρός, σύ, σύν, τά, τε, τήν, τῆς, τῇ, τι, τί, τις, τίς, τό, τοί, τοιοῦτος, τόν, τούς, τοῦ, τῶν, τῷ, ὑμός, ὑπέρ, ὑπό, ὡς, ὦ, ὥστε, ἐάν, παρά, σός".split(', ')
        + "συ δ μοι".split(' ')
    )
]

def tokenize(string):
    """Split ``string`` into a list of word tokens.

    The result depends on two module-level switches:

    * ``use_lemma_disambiguation`` -- when falsy, every character that is
      neither alphabetic nor a plain space is removed before splitting (this
      strips digits and punctuation, and also newlines/tabs). When truthy the
      text is split on whitespace untouched.
    * ``force_lowercase`` -- when truthy, tokens are lowercased and any token
      whose ``unidecode()`` form is in ``perseus_stopwords`` is dropped. When
      falsy, tokens are returned unchanged and no stopword filtering is done.

    Returns:
        list of str: the tokens, in their original order.
    """
    if use_lemma_disambiguation:
        cleaned = string
    else:
        # Keep letters and spaces only.
        cleaned = ''.join(ch for ch in string if ch.isalpha() or ch == ' ')

    words = cleaned.split()
    if not force_lowercase:
        return words

    kept = []
    for word in words:
        lowered = word.lower()
        # The stopword comparison is done on the transliterated form; the token
        # that is returned keeps its original (lowercased) spelling.
        if unidecode(lowered) not in perseus_stopwords:
            kept.append(lowered)
    return kept
    
class Texts:
    """Re-iterable stream of tokenized documents read from one corpus folder.

    Each iteration walks the ``.txt`` files of
    ``<data_root>/<selected_corpus>`` and yields one token list per file, so
    the corpus can be traversed several times without holding the raw text in
    memory.

    Args:
        selected_corpus: name of the sub-folder of ``data_root`` to read.
        data_root: folder containing the corpus sub-folders. Defaults to the
            data folder of the cloned repository.

    Yields:
        list of str: the tokens of one file. Lines that tokenize to fewer than
        two tokens are ignored, and files that end up with fewer than two
        tokens are not yielded.
    """

    def __init__(self, selected_corpus, data_root='biblical-machine-learning/data'):
        self.selected_corpus = selected_corpus
        self.data_root = data_root

    def __iter__(self):
        corpus_dir = os.path.join(self.data_root, self.selected_corpus)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # document order (and anything derived from it) reproducible.
        for file in sorted(os.listdir(corpus_dir)):
            if not file.endswith(".txt"):
                continue
            text = []
            # Read-only, with an explicit encoding and a context manager so
            # the handle is closed once the file has been consumed.
            # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
            with open(os.path.join(corpus_dir, file), 'r', encoding='utf-8') as handle:
                for line in handle:
                    tokens = tokenize(line)
                    if len(tokens) > 1: # skip one-word lines, since these are often enumerations
                        text += tokens
            if len(text) > 1: # skip one-word texts, if they exist
                yield text

## Create corpus and dictionary

In [73]:
# Stream the 'texts' corpus folder; nothing is read until the object is iterated.
texts = Texts(selected_corpus='texts')

In [74]:
# Sanity check: show the first ten tokens of the first document in the stream.
# NOTE(review): the sample below presumably describes the 'lemmas' corpus
# (numeric suffixes such as 'ὅτι2'); other corpora will print different tokens.
# ['ἐν', 'ὁ', 'πρότερος', 'ὅτι2', 'εὔχομαι', 'νύξ', 'καί', 'ἡμέρα', 'ὁράω', ...etc.
first_text = next(iter(texts), None)
if first_text is None:
    # An empty stream would otherwise print nothing at all.
    print('No texts found in the selected corpus.')
else:
    print(first_text[0:10])

['μαξίμου', 'καταρχῶν', 'μεταφρασθὲν', 'πεζῇ', 'λέξει', 'ἡρωικῶν', 'μέτρων', 'ἄγε', 'κούρη', 'πιμπληιὰς']


In [75]:
# Build the gensim Dictionary from the streamed corpus and report how long it took.
print("Generating dictionary . . . ")
start = time.time()
dictionary = Dictionary(texts)
elapsed = time.time() - start
print("\nDictionary initialized in {0:.2f} seconds.".format(elapsed))

Generating dictionary . . . 

Dictionary initialized in 86.41 seconds.


In [76]:
# Convert every document to its bag-of-words form using the dictionary.
# The whole result is materialised as a list, so the corpus is streamed once more here.
print("Generating corpus . . . ")
start = time.time()
corpus = list(map(dictionary.doc2bow, texts))
elapsed = time.time() - start
print("\nCorpus initialized in {0:.2f} seconds.".format(elapsed))

Generating corpus . . . 

Corpus initialized in 80.50 seconds.


## Train model

In [77]:
# Fit the TF-IDF model on the bag-of-words corpus and report how long it took.
print("Generating model . . . ")
start = time.time()
model = TfidfModel(corpus)
elapsed = time.time() - start

print("\nModel initialized in {0:.2f} seconds.".format(elapsed))

Generating model . . . 

Model initialized in 2.33 seconds.


## Query model

In [94]:
# Sample Greek sentence used as the query; replace it with any text to analyse.
input_text = "Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."

In [96]:
# Score the query text with the trained model and list its highest-weighted tokens.
# tokenize() already returns a list, so no extra copy is needed.
input_tokens = tokenize(input_text)
# Tokens that are not in the dictionary are ignored by doc2bow().
input_bow = dictionary.doc2bow(input_tokens)
input_tfidf = model[input_bow]
# Keep the ten (token id, weight) pairs with the largest weight.
summary = sorted(input_tfidf, key=lambda x: x[1], reverse=True)[:10]
print('Most significant words in input text: ')
# `token_id` rather than `id`: in a notebook the loop variable is a global and
# would shadow the builtin id() for every later cell.
for token_id, score in summary:
    token = dictionary[token_id]
    print(f'{score:.2f}: {token}')

Most significant words in input text: 
0.58: θεόν
0.54: λόγος
0.47: ἀρχῇ
0.40: θεὸς
