## Import dependencies

In [2]:
import os
import codecs
import time
try:
    from collections.abc import Mapping
    from gensim.models import FastText
except:
#     print("Depencies not found. Make sure you have installed GenSim.")
    !pip install -I gensim
#     !pip install -Iv gensim==3.2.0
    from collections.abc import Mapping
    from gensim.models import FastText

In [3]:
# Every visible sub-directory of ./data is treated as one corpus. Hidden entries
# (names starting with '.') are skipped, and so are regular files: the corpus
# streamer calls os.listdir() on each entry, which would fail on a file.
corpus_directories = [
    path for path in os.listdir('./data')
    if not path.startswith('.') and os.path.isdir(os.path.join('./data', path))
]
print('Directories found in data folder:', corpus_directories)

Directories found in data folder: ['papyri', 'corpus']


## Locate corpus data

In [4]:
# When True, tokenize() lowercases every token it returns.
force_lowercase = True
# When False, tokenize() strips every digit character from the text, so a suffixed
# lemma is merged with its unsuffixed form; when True the digits are kept.
use_lemma_disambiguation = False # Some lemmas are indicated with a numeric suffix (e.g., 'ὅτι2')

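# NOTE: the os.chdir() call below is disabled. Corpus files are opened through paths relative to the
# notebook's working directory (./data/<corpus>/<file>), so run the notebook from the folder that contains ./data.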
# TODO: rewrite this corpus iterator without the codecs module.

# if not(os.getcwd().split('/')[-1].endswith('corpus')):
#     os.chdir(corpus_directory)

# tokenize() turns one line of text into a list of tokens; MySentences streams the tokenized corpus each time it is iterated.

def tokenize(string):
    """Split one line of corpus text into whitespace-delimited tokens.

    Reads two module-level switches at call time:

    * ``use_lemma_disambiguation`` -- when false, every digit character is
      removed from the text before it is split.
    * ``force_lowercase`` -- when true, each token is lowercased.

    Args:
        string: The raw line of text.

    Returns:
        A list of token strings (empty when the line holds no tokens).
    """
    if use_lemma_disambiguation:
        text = string
    else:
        # Drop digit characters before splitting
        text = ''.join(char for char in string if not char.isdigit())

    words = text.split()
    return [word.lower() for word in words] if force_lowercase else words
    
class MySentences(object):
    """Re-iterable stream of tokenized lines from the corpus text files.

    Each iteration walks every directory named in the module-level
    ``corpus_directories``, reads each ``.txt`` file found directly inside it
    line by line, passes the line to ``tokenize`` and yields the resulting
    token list. Lines that produce no tokens are skipped. Because ``__iter__``
    is a generator function, every new loop over the object starts again from
    the first file.

    Args:
        data_root: Folder that contains the corpus directories.
        encoding: Text encoding used to decode the corpus files.
    """

    def __init__(self, data_root='./data', encoding='utf-8'):
        self.data_root = data_root
        self.encoding = encoding

    def __iter__(self):
        for corpus_dir in corpus_directories: # the directories where the text files are.
            for file in os.listdir(f'{self.data_root}/{corpus_dir}'):
                if not file.endswith(".txt"):
                    continue
                # Open read-only (the corpus is never written to) with an explicit
                # encoding instead of the platform default. The `with` block closes
                # the handle when the file is exhausted or the generator is closed
                # early (e.g. after a `break` in the consuming loop).
                with open(f'{self.data_root}/{corpus_dir}/{file}', 'r', encoding=self.encoding) as handle:
                    for line in handle:
                        tokens = tokenize(line)
                        if len(tokens) > 0:
                            yield tokens

## Instantiate the corpus streamer

In [5]:
# MySentences defines __iter__ as a generator, so every new loop over `sentences`
# re-reads the corpus files from the start instead of holding them in memory.
sentences = MySentences()

## Sanity check

In [6]:
# Quick visual check: print the first six token lists produced by the corpus stream.
# Each printed item should be a list of lemma strings, e.g. ['ἐν', 'ὁ', 'πρότερος', ...].
for count, tokens in enumerate(sentences, start=1):
    print(tokens)
    if count > 5:
        break

['κύριος', 'δίδυμος', 'χαίρω']
['οἶδα', 'ὅτι', 'ὁ', 'ὁ', 'ὁ', 'ἔτος', 'ἔτι', 'πέντε', 'ὁ', 'ἀπό', 'ὁ', 'λέγω', 'ὅτι', 'εἰ', 'μή', 'οὗτος', 'πέμπω', 'ὅπως', 'ἀπέρχομαι', 'εἰς', 'ὁ', 'καί', 'ποιέω', 'γίγνομαι', 'ἐπεί', 'γάρ', 'ἀσχολέω', 'πέμπω', 'ἐλάτης', 'ἐπεί', 'οὐδείς', 'ἐλαύνω']
['ἐπερωτάω', 'ὁμολογέω']
['λούκιος', 'εὐσεβής', 'καί', 'μάρκος', 'εὐσεβής', 'καί', 'πούπλιος']
['δημητρία', 'ὁ', 'καί', 'ἑρμιόνη', 'ἐμός', 'ὁ', 'υἱός']
['παρά', 'ἡρακλείδης']


## Set hyperparameters

In [7]:
# Dimensionality of the word vectors (too small = underfit; too large = overfit)
size_input = 300 

# Context window size (small = paradigmatic model; large = syntagmatic model)
window_input = 5 

# Minimum word count for inclusion in the network
min_count_input = 2 # If a word occurs only a few times, its vector will not be of very high quality

# Use skip-gram (False means the model will train with CBOW)
use_skip_gram = False

# Use hierarchical softmax
use_hs = True

# Use negative sampling (only takes effect if use_hs is False)
use_ns = False

# Minimum length of the character n-grams the model will use for training
min_char_ngram_len = 2

# Maximum length of the character n-grams the model will use for training
max_char_ngram_len = 5

# If max_char_ngram_len < min_char_ngram_len, then character n-grams will not be used in training

## Train model

In [9]:
# Train a FastText model on the streamed corpus with the hyperparameters defined above.

# `negative` is a COUNT of noise words drawn per positive example, not an on/off flag,
# so enabling negative sampling must pass a real sample count (5 is gensim's default).
negative_sample_count = 5

# Fail before the (slow) corpus scan if no training objective is selected.
if not (use_hs or use_ns):
    raise ValueError(
        "Both use_hs and use_ns are disabled: enable hierarchical softmax or "
        "negative sampling so the model has a training objective."
    )

print("Generating model . . . ")
start = time.time()
model = FastText(
    sentences,
    vector_size=size_input,
    window=window_input,
    min_count=min_count_input,
    sg=1 if use_skip_gram else 0,   # 1 = skip-gram, 0 = CBOW
    hs=1 if use_hs else 0,          # 1 = hierarchical softmax
    # Negative sampling is only used when hierarchical softmax is off.
    negative=negative_sample_count if (use_ns and not use_hs) else 0,
    workers=4,
    min_n=min_char_ngram_len,
    max_n=max_char_ngram_len
)

print("\nModel initialized in {0} seconds.".format(time.time() - start))
# len(model.wv) is the size of the trained vocabulary, i.e. the words that were
# kept after the `min_count` filter was applied.
print("\nTotal number of unique words in corpus: {0}.".format(len(model.wv)))

Generating model . . . 

Model initialized in 248.383455991745 seconds.

Total number of unique words in corpus: 31903.


## Save model

In [11]:
def generate_model_name():
    """Build a descriptive file stem for the model from the notebook's settings.

    Reads the module-level ``corpus_directories`` and hyperparameter variables
    and encodes them as ``ft_<corpora>_<architecture>[_hs][_ns]_<min>_to_<max>``
    followed by ``_size<..>_window<..>_mincount<..>``.

    Returns:
        The model name as a string, without directory or file extension.
    """
    parts = ['ft_' + '&'.join(corpus_directories)]
    parts.append('skipgram' if use_skip_gram else 'cbow')
    if use_hs:
        parts.append('hs')
    # Negative sampling is only applied when hierarchical softmax is off, so the
    # 'ns' tag is added only in that case; otherwise the name would advertise an
    # objective the model was not trained with.
    if use_ns and not use_hs:
        parts.append('ns')
    parts.append(f'{min_char_ngram_len}_to_{max_char_ngram_len}')
    parts.append(f'size{size_input}_window{window_input}_mincount{min_count_input}')
    return '_'.join(parts)

# Name the file after the current hyperparameters and persist the trained model.
model_name = generate_model_name()
# Make sure the output folder exists so the save does not fail on a fresh checkout.
os.makedirs('./models', exist_ok=True)
model.save(f'./models/{model_name}.model')