## Import dependencies

In [2]:
import os
import codecs
import time
try:
    from collections.abc import Mapping
    from gensim.models.word2vec import Word2Vec
except:
#     print("Depencies not found. Make sure you have installed GenSim.")
    !pip install -I gensim
#     !pip install -Iv gensim==3.2.0
    from collections.abc import Mapping
    from gensim.models.word2vec import Word2Vec

## Locate corpus data

In [42]:
corpus_directory = './data/corpus'
force_lowercase = True
use_lemma_disambiguation = False # Some lemmas are indicated with a numeric suffix (e.g., 'ὅτι2')

# MySentences lists and opens files relative to the current working directory,
# so change into the corpus directory before streaming the corpus.
# TODO: rewrite this corpus iterator without the codecs module.

# Skip the chdir when the working directory already looks like the corpus
# directory (e.g. when this cell is re-run). os.path.basename is used instead of
# splitting on '/' so the check also works with Windows path separators.
if not os.path.basename(os.getcwd()).endswith('corpus'):
    os.chdir(corpus_directory)

# tokenize() splits one line of text into tokens; the MySentences class streams through the corpus when iterated.

def tokenize(string, *, lowercase=None, keep_disambiguation=None):
    """Split one line of corpus text into whitespace-delimited tokens.

    Args:
        string: The line of text to tokenize.
        lowercase: If true, lowercase every token. ``None`` (the default)
            defers to the module-level ``force_lowercase`` setting.
        keep_disambiguation: If true, keep the numeric suffixes that mark
            disambiguated lemmas (e.g. 'ὅτι2'); otherwise every digit
            character is removed from the line before it is split. ``None``
            (the default) defers to the module-level
            ``use_lemma_disambiguation`` setting.

    Returns:
        A list of token strings; empty when the line contains no tokens.
    """
    # Resolve the module-level settings only when the caller did not override
    # them, so existing one-argument calls behave exactly as before.
    if lowercase is None:
        lowercase = force_lowercase
    if keep_disambiguation is None:
        keep_disambiguation = use_lemma_disambiguation

    if not keep_disambiguation:
        # Filter numeric digits from the whole line (not just token suffixes).
        string = ''.join(char for char in string if not char.isdigit())

    tokens = string.split()
    if lowercase:
        tokens = [token.lower() for token in tokens]
    return tokens
    
class MySentences(object):
    """Re-iterable stream of tokenized lines from the ``.txt`` files in a directory.

    Every iteration lists the directory again and reads each matching file
    line by line, yielding the non-empty token lists produced by ``tokenize``.

    Args:
        directory: Directory containing the text files. Defaults to the
            current working directory.
        encoding: Text encoding used to read the files. Defaults to UTF-8
            (assumed for this corpus -- confirm); pass ``None`` to use the
            platform's locale default instead.
    """

    def __init__(self, directory='.', encoding='utf-8'):
        self.directory = directory
        self.encoding = encoding

    def __iter__(self):
        for name in os.listdir(self.directory):
            if not name.endswith(".txt"):
                continue
            path = os.path.join(self.directory, name)
            # Open read-only (no write permission needed) with an explicit
            # encoding, and use `with` so the handle is closed even when the
            # consumer stops iterating early.
            with open(path, 'r', encoding=self.encoding) as handle:
                for line in handle:
                    tokens = tokenize(line)
                    if tokens:
                        yield tokens

## Instantiate the corpus streamer

In [43]:
# Creating the streamer reads nothing from disk: files are opened only when the
# object is iterated, and every new iteration starts again from the first file.
sentences = MySentences()

## Sanity check

In [44]:
# Preview the first six tokenized lines of the corpus.
# Output should resemble: ['ἐν', 'ὁ', 'πρότερος', 'ὅτι', 'εὔχομαι', 'νύξ', 'καί', 'ἡμέρα', 'ὁράω', ...etc.
# (numeric suffixes such as 'ὅτι2' only appear when use_lemma_disambiguation is True).
preview_limit = 6
for shown, tokens in enumerate(sentences, start=1):
    print(tokens)
    if shown >= preview_limit:
        break

['ἐν', 'ὁ', 'πρότερος', 'ὅτι', 'εὔχομαι', 'νύξ', 'καί', 'ἡμέρα', 'ὁράω', 'καί', 'ὅτι', 'οὐ', 'στέγω', 'ἀλλά', 'ἐν', 'ἀθήνη', 'μόνος', 'καί', 'ὅτι', 'πέμπω', 'τιμόθεος', 'διά', 'πᾶς', 'οὗτος', 'ὁ', 'πόθος', 'αὐτός', 'δηλόω', 'ὅς', 'ὥστε', 'παραγίγνομαι', 'πρός', 'αὐτός']
['ἐπεί', 'οὖν', 'οὐ', 'φθάνω', 'ἴσος', 'ἀπέρχομαι', 'καί', 'καταρτίζω', 'ὁ', 'ὑστέρημα', 'ὁ', 'πίστις', 'αὐτός', 'οὗτος', 'χάρις', 'ὁ', 'δεύτερος', 'προστίθημι']
['ὁ', 'ἐλλείπω', 'ἀπό', 'ὁ', 'παρουσία', 'διά', 'ὁ', 'πρᾶγμα']
['ὅτι', 'γάρ', 'οὐ', 'ἀπέρχομαι', 'εἰμί', 'στοχάζομαι']
['γράφω', 'γάρ', 'φημί', 'ἐρωτάω', 'δέ', 'ὑπέρ', 'ὁ', 'παρουσία', 'ὁ', 'ἐγώ', 'ἰησοῦς']
['καί', 'γάρ', 'ἐν', 'ὁ', 'πρότερος', 'στολή', 'λέγω', 'ὅτι', 'περί', 'δέ', 'ὁ', 'χρόνος', 'καί', 'καιρός', 'οὐ', 'χρεία', 'ἔχω', 'γράφω', 'ὥστε', 'εἰ', 'παραγίγνομαι', 'δέω', 'γράφω']


## Set hyperparameters

In [45]:
# Vector size (too small = underfit; too large = overfit)
# Passed to Word2Vec below as `vector_size`.
size_input = 300 

# Window size (small = paradigmatic model; large = syntagmatic model)
# Passed to Word2Vec below as `window`.
window_input = 5 

# Minimum word count for inclusion in network
# Passed to Word2Vec below as `min_count`.
min_count_input = 2 # If a word occurs few times then its vector will not be very high quality

## Train model

In [46]:
print("Generating model . . . ")
# perf_counter() is a monotonic clock meant for measuring durations, so the
# reported time cannot be skewed by system clock adjustments during training
# (time.time() follows the wall clock and can jump).
start = time.perf_counter()
# Supplying the corpus to the constructor builds the vocabulary and trains the
# model in this one call; `sentences` is iterated more than once, which is why
# it is a re-iterable object rather than a one-shot generator.
model = Word2Vec(sentences, vector_size=size_input, window=window_input, min_count=min_count_input, workers=4)
# Create a list of all the unique words in the corpus, in case user wants to query all words.
#     words_seen = set() # holds lines already seen
#     allWords = []
#     for line in sentences:
#         for word in line:
#             if word not in words_seen: # not a duplicate
#                 allWords.append(word)
#                 words_seen.add(word)   

print("\nModel initialized in {0} seconds.".format(time.perf_counter() - start))
# NOTE(review): len(model.wv) is the size of the trained vocabulary; words seen
# fewer than min_count_input times are presumably excluded, so this can be
# smaller than the number of distinct words in the raw corpus.
print("\nTotal number of unique words in corpus: {0}.".format(len(model.wv)))

Generating model . . . 

Model initialized in 96.59753274917603 seconds.

Total number of unique words in corpus: 31754.


## Save model

In [47]:
model_name = 'nov2022'

# The path is relative to the current working directory, which the corpus
# set-up cell may have changed to the corpus directory via os.chdir().
models_directory = '../../models'
# Create the output directory if it is missing, so a finished training run is
# not lost to a missing-directory error at save time.
os.makedirs(models_directory, exist_ok=True)
model.save(f'{models_directory}/{model_name}.model')