# Load and query pre-trained model

To query a pre-trained model, you need a trained model in the `./models` folder.

## Import dependencies

In [1]:
import os
try:
    from collections.abc import Mapping
    from gensim.models.word2vec import Word2Vec
    from gensim.models import FastText
except:
#     print("Depencies not found. Make sure you have installed GenSim.")
    !pip install -Iv gensim
    from collections.abc import Mapping
    from gensim.models.word2vec import Word2Vec
    from gensim.models import FastText

## List available models

If you want to train a new model, use the `build_greek_w2v_model.ipynb` or `build_greek_fasttext_model.ipynb` notebook.

In [2]:
# Print every entry of ./models together with its position in `models`.
print('Available models:\n')
models = os.listdir('./models')
for count, filename in enumerate(models):
    # Hidden entries (names starting with '.') are not printed, but they
    # still occupy a slot in `models`; the printed number is therefore always
    # a valid index into `models`, even if the numbering has gaps.
    if not filename.startswith('.'):
        print(f'{count}. "{filename}"\n')

Available models:

0. "ft_papyri&corpus_cbow_hs_2_to_5_size300_window5_mincount2.model"

1. "ft_papyri&corpus_cbow_hs_2_to_5_size300_window5_mincount2.model.wv.vectors_ngrams.npy"

2. "papyri&corpus_skipgram_size300_window5_mincount2.model"

3. "dec2017.model"

4. "nov2022.model"

5. "nov2022_includes_papyri.model"

6. "sept2018.model"

8. "papyri&corpus_cbow_size300_window5_mincount2.model"



[None, None, None, None, None, None, None, None]

In [3]:
# Index of the model to load, i.e. the number printed in front of its file
# name in the "Available models" listing.
# NOTE: not every listed entry is necessarily a loadable model (the listing
# can also contain e.g. `.npy` arrays stored next to a model).
selected_model_number = 0

# File name (without directory) of the selected entry inside ./models.
model_name = models[selected_model_number]

In [4]:
# Load the saved model file selected via `model_name` and keep only its `.wv`
# attribute; the similarity queries in this notebook are called on that object.
model = Word2Vec.load(f'./models/{model_name}').wv

## Get most-similar hits for input lemma

In [7]:
# Sample lemmas to query; the first one is used below.
test_lemmas = [
    'ἀρχιερεύς',
    'ἄγγελος',
    'πιλᾶτος',
    'πέτρος',
    'σῶμα',
    'ψυχή',
    'νέκρωσις',
    'λύγξ',
    'λόγος',
    'φιλοσοφία',
]
print(test_lemmas[0])
# Last expression of the cell: the notebook displays the 10 nearest entries
# for the lemma together with their similarity scores.
model.most_similar(test_lemmas[0], topn=10)

ἀρχιερεύς


[('ἀρχιερατεύω', 0.6793726682662964),
 ('ἀρχιερωσύνη', 0.6621080040931702),
 ('περεύς', 0.659621000289917),
 ('ἱερεύς', 0.6541177034378052),
 ('ἀρχιερατικός', 0.6150493621826172),
 ('ἀρεύς', 0.5969539284706116),
 ('πρῳρεύς', 0.5262369513511658),
 ('κοπρεύς', 0.5242884159088135),
 ('ἀρχιτέλης', 0.509056806564331),
 ('ἀρχισωματοφύλαξ', 0.5088239908218384)]

## Compare tokenized sentences

In [54]:
# Two lists of lemmas to compare with each other as whole sets.
word_set_1 = ['ἀρχιερεύς', 'ἄγγελος']
word_set_2 = ['πιλᾶτος', 'πέτρος']

# Last expression of the cell: displays one similarity score for the two
# lists (per the gensim docs, the cosine similarity between the two sets —
# confirm against the installed gensim version).
model.n_similarity(word_set_1, word_set_2)

0.5299723