In [598]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [599]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [600]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it import a +600MB file containing ancient Greek dictionary

In [601]:
### not neccessary for reading the data, just for exporting them to sciencedata.dk
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [152]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
PIA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KxOx7Be9fj3lDcEPgQhQ-Iqcn9p367-MMD6RMXe8rks/edit?usp=sharing")
PIA_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1e94wyelg6dftQ4zxbq1xvwxWAI-BhcYXtclDW-YTnrw/edit?usp=sharing")

# Import corpus

In [153]:
publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
c_hippocraticum = sddk.read_file("c_hippocraticum_enriched.json", "df", publicfolder)

reading file located in a public folder


In [154]:
def get_flat_sentences(series):
    sentences_list = [sent for doc in series.tolist() for sent in doc]
    return sentences_list

In [155]:
# docs = sentences
docs = get_flat_sentences(c_hippocraticum["lemmatized_sentences_repl"])

In [902]:
len(docs)

24456

In [904]:
morpheus_by_lemma["ἄλγημα"]

[{'d': 1341, 'l': 'ἄλγημα', 's': 'pain, suffering', 'a': None}]

In [156]:
print(docs[:10])

[['ὁπόσος', 'ἐπιχειρέω', 'ἰητρικῆς', 'λέγω', 'γράφω', 'ὑπόθεσις', 'αὐτός', 'ὑποθέμενοι', 'λόγος', 'θερμός', 'ψυχρός', 'ὑγρός', 'ξηρός', 'ἄλλος', 'ἐθέλω', 'βραχύς', 'ἄγω', 'ἀρχή', 'αἰτία', 'ἄνθρωπος', 'νοῦσος', 'θάνατος', 'πᾶς', 'αὐτός', 'εἷς', 'ὑποθέμενοι', 'πολύς', 'λέγω', 'καταφανής', 'εἰμί', 'ἁμαρτάνω', 'ἄξιος', 'μέμφομαι', 'τέχνη', 'εἰμί', 'χράω', 'πᾶς', 'μέγας', 'τιμάω', 'ἀγαθός', 'χειροτέχνης', 'δημιουργός'], ['δημιουργός', 'φαῦλος', 'πολύς', 'διαφέροντες'], ['εἰμί', 'ἰατρικός', 'αὐτός', 'ἔσκεπτο', 'εὑρίσκω', 'μηδείς', 'εἰμί', 'πᾶς', 'ἄπειρος', 'ἀνεπιστήμων', 'εἰμί', 'τύχη', 'κάμνω', 'διοικεῖτο'], ['ἔχω', 'ἄλλος', 'τέχνη', 'δημιουργός', 'πολύς', 'διαφέρω', 'χείρ', 'γνώμη', 'ἰητρικῆς'], ['ἀξιόω', 'κενός', 'ὑπόθεσις', 'δέω', 'ἀφανέα', 'ἀπορεόμενα', 'ἀνάγκη', 'ἐπιχειρέω', 'λέγω', 'ὑπόθεσις', 'χράω', 'οἷος', 'μετέωρος', 'γῆ'], ['τὶς', 'λέγω', 'γιγνώσκω', 'ἔχω', 'λέγω', 'ἀκούω', 'δῆλος', 'εἰμί', 'ἀληθής', 'εἰμί'], ['χρή', 'ἀνενέγκαντα', 'οἶδα', 'σαφής'], ['ἰατρικός', 'πᾶς', 'ὑπάρχω', 

In [157]:
c_hippocraticum["wordcount"].sum()

333443

In [158]:
c_hippocraticum["lemmata_repl"].apply(lambda x: len(x)).sum()

171332

In [159]:
# perhaps we will also explore our subcorpora at some point
subcorpora = {}
for cat in c_hippocraticum["work_cat_linka"].unique():
    subcorpora[cat] = get_flat_sentences(c_hippocraticum[c_hippocraticum["work_cat_linka"]==cat]["lemmatized_sentences_repl"])

for key in subcorpora.keys():
    print(key, len(subcorpora[key]))

Theoretical 2501
Other 16911
Practical 5044


In [507]:
#create gensim dictionary for our list of sentences
dictionary_sents = corpora.Dictionary(docs)

there is a number of methods to be applied upon a dictionary object:
see: https://radimrehurek.com/gensim/corpora/dictionary.html

First of all, you can inspect it as a standard dictionary object


In [161]:
# uncomment below to print the whole dictionary
# dictionary

In [508]:
# as such, it is organized by ids
dict(list(dictionary_sents.items())[:10])

{0: 'αἰτία',
 1: 'αὐτός',
 2: 'βραχύς',
 3: 'γράφω',
 4: 'δημιουργός',
 5: 'εἰμί',
 6: 'εἷς',
 7: 'θάνατος',
 8: 'θερμός',
 9: 'καταφανής'}

In [509]:
# but you can access it reveresly by applying token2id method
dict(list(dictionary_sents.token2id.items())[:10])

{'αἰτία': 0,
 'αὐτός': 1,
 'βραχύς': 2,
 'γράφω': 3,
 'δημιουργός': 4,
 'εἰμί': 5,
 'εἷς': 6,
 'θάνατος': 7,
 'θερμός': 8,
 'καταφανής': 9}

In [510]:
# collection_frequencies
dictionary_sents.cfs[7] # how many instances of word with id 7 (= "θάνατος")

81

In [511]:
dictionary_sents.dfs[7] # how many documents contain the word with id 7 (= "θάνατος")

81

In [512]:
dictionary_sents.num_pos # previously 167358

171332

In [513]:
len(dictionary_sents.keys())

25275

In [168]:
sent = docs[20]
sent

['ἀνάγκη',
 'ἰατρικός',
 'ποιέω',
 'ζητέω',
 'εὑρίσκω',
 'ἄνθρωπος',
 'κάμνω',
 'ταὐτός',
 'προσφερομένοισι',
 'ὑγιαίνοντες',
 'συμφέρω',
 'συμφέρω']

In [514]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary_sents.doc2bow(sent)

[(28, 1),
 (40, 1),
 (41, 1),
 (47, 1),
 (57, 1),
 (72, 1),
 (118, 1),
 (119, 2),
 (122, 1),
 (123, 1),
 (124, 1)]

In [515]:
# we can use our gensim dictionary

# as a corpus, we cannot use the Gensim default BoW model,

# we just need the words replaced by values

corpus_bow = [dictionary_sents.doc2bow(sent) for sent in docs]
corpus_idx = [dictionary_sents.doc2idx(sent) for sent in docs]

# LSA with  sklearn

In [745]:
# build gensim dictionary on the basis of whole works
lemmata = c_hippocraticum["lemmata_repl"].tolist()
dictionary = corpora.Dictionary(lemmata)
# words in at least 2 works
# with the number of sentences in which they appear
dictionary_tup = [(len([doc for doc in docs if dictionary[word_id] in doc]), dictionary[word_id]) for word_id in dictionary if dictionary.dfs[word_id] >= 2]
# n sorted words with highest frequency
n = 2000
vocabulary = [tup[1] for tup in sorted(dictionary_tup, reverse=True)[:n]]
doc_freq = [tup[0] for tup in sorted(dictionary_tup, reverse=True)[:n]]

In [746]:
sorted(dictionary_tup, reverse=True)[:10]

[(3315, 'εἰμί'),
 (2829, 'γίγνομαι'),
 (2351, 'οὗτος'),
 (2103, 'πολύς'),
 (1775, 'ἔχω'),
 (1224, 'ἄλλος'),
 (1169, 'σῶμα'),
 (1129, 'ποιέω'),
 (1030, 'χρή'),
 (1010, 'αὐτός')]

In [747]:
len(vocabulary)

2000

In [748]:
for word in key_words:
    if word in vocabulary:
        print(word)

λύπ*
ἄλγ*
ὀδύν*
πόνο*


In [749]:
vocabulary[:10]

['εἰμί',
 'γίγνομαι',
 'οὗτος',
 'πολύς',
 'ἔχω',
 'ἄλλος',
 'σῶμα',
 'ποιέω',
 'χρή',
 'αὐτός']

In [750]:
docs[0]

['ὁπόσος',
 'ἐπιχειρέω',
 'ἰητρικῆς',
 'λέγω',
 'γράφω',
 'ὑπόθεσις',
 'αὐτός',
 'ὑποθέμενοι',
 'λόγος',
 'θερμός',
 'ψυχρός',
 'ὑγρός',
 'ξηρός',
 'ἄλλος',
 'ἐθέλω',
 'βραχύς',
 'ἄγω',
 'ἀρχή',
 'αἰτία',
 'ἄνθρωπος',
 'νοῦσος',
 'θάνατος',
 'πᾶς',
 'αὐτός',
 'εἷς',
 'ὑποθέμενοι',
 'πολύς',
 'λέγω',
 'καταφανής',
 'εἰμί',
 'ἁμαρτάνω',
 'ἄξιος',
 'μέμφομαι',
 'τέχνη',
 'εἰμί',
 'χράω',
 'πᾶς',
 'μέγας',
 'τιμάω',
 'ἀγαθός',
 'χειροτέχνης',
 'δημιουργός']

In [758]:
# SIMPLE TEST
# (by default, countvectorizers does some undesirable preprocessing (e.g. removes *....))
test_corpus =  [['δῆμος*', 'Ἀθηναῖος','φαίνω','Ἕλλην'], ['δοκέω','οὗτος','τέχνη','ἐμός']]
vec_bow = CountVectorizer(lowercase=False, token_pattern=r"\w+\*?") #vocabulary=vocabulary, token_pattern=r".?\w+\*?") #, #strip_accents=None, encoding="utf-8") #, token_pattern=r"\*?\w+\*?")
bow = vec_bow.fit_transform([" ".join(sentence) for sentence in test_corpus]) ### run the model
vec_bow.get_feature_names()

['δοκέω', 'δῆμος*', 'οὗτος', 'τέχνη', 'φαίνω', 'Ἀθηναῖος', 'ἐμός', 'Ἕλλην']

In [759]:
vec_bow = CountVectorizer(lowercase=False, token_pattern=r"\w+\*?") #vocabulary=vocabulary, token_pattern=r".?\w+\*?") #, #strip_accents=None, encoding="utf-8") #, token_pattern=r"\*?\w+\*?")
bow = vec_bow.fit_transform([" ".join(sentence) for sentence in test_corpus]) ### run the model
#cooc  = bow.T * bow
#cooc_df = pd.DataFrame(cooc.todense(), columns=vec_bow.featurenames(), index=vec_bow.feature_names()) # / len(docs
vec_bow.get_feature_names()

['δοκέω', 'δῆμος*', 'οὗτος', 'τέχνη', 'φαίνω', 'Ἀθηναῖος', 'ἐμός', 'Ἕλλην']

In [760]:
vec_tfidf = TfidfVectorizer(vocabulary=vocabulary, lowercase=False, token_pattern=r"\w+\*?") ### initiaze the model
vec_bow = CountVectorizer(vocabulary=vocabulary, lowercase=False, token_pattern=r"\w+\*?") #vocabulary=vocabulary, token_pattern=r".?\w+\*?") #, #strip_accents=None, encoding="utf-8") #, token_pattern=r"\*?\w+\*?")

tfidf = vec_tfidf.fit_transform([" ".join(sentence) for sentence in docs]) ### run the model
bow = vec_bow.fit_transform([" ".join(sentence) for sentence in docs]) ### run the model

cooc  = bow.T * bow
cooc.setdiag(doc_freq) # word-word coocurrence matrix
cooc_df = pd.DataFrame(cooc.todense() / len(docs), columns=vocabulary, index=vocabulary) # / len(docs)
cooc_df


Changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.



Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,0.135550,0.025515,0.022571,0.023266,0.018768,0.014230,0.014475,0.008505,0.010345,0.012512,...,0.000041,0.000368,0.000082,0.000041,0.000000,0.000082,0.000041,0.000082,0.000000,0.000123
γίγνομαι,0.025515,0.115677,0.020077,0.022448,0.012226,0.009773,0.013739,0.005275,0.003721,0.009446,...,0.000164,0.000000,0.000164,0.000000,0.000041,0.000041,0.000082,0.000082,0.000000,0.000123
οὗτος,0.022571,0.020077,0.096132,0.013494,0.010672,0.007565,0.005357,0.008301,0.005561,0.004702,...,0.000082,0.000041,0.000082,0.000041,0.000000,0.000000,0.000000,0.000041,0.000082,0.000000
πολύς,0.023266,0.022448,0.013494,0.085991,0.012512,0.009486,0.009650,0.006133,0.005357,0.005561,...,0.000041,0.000082,0.000000,0.000000,0.000000,0.000041,0.000082,0.000082,0.000041,0.000245
ἔχω,0.018768,0.012226,0.010672,0.012512,0.072579,0.008178,0.008137,0.004130,0.006093,0.005520,...,0.000123,0.000000,0.000082,0.000041,0.000000,0.000000,0.000041,0.000123,0.000041,0.000041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,0.000082,0.000041,0.000000,0.000041,0.000000,0.000041,0.000041,0.000082,0.000041,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000368,0.000000,0.000000,0.000000,0.000000
ἀποπέτομαι,0.000041,0.000082,0.000000,0.000082,0.000041,0.000000,0.000082,0.000000,0.000000,0.000041,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000368,0.000000,0.000000,0.000000
ἀναΐσσει,0.000082,0.000082,0.000041,0.000082,0.000123,0.000082,0.000041,0.000000,0.000000,0.000082,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000368,0.000000,0.000000
ἀλφίτοισι,0.000000,0.000000,0.000082,0.000041,0.000041,0.000041,0.000000,0.000041,0.000041,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000368,0.000000


In [841]:
pmi_rows_list = []
for column in cooc_df.columns:
    freq_a = cooc_df.at[column, column]
    pmi_row = []
    for term in cooc_df.index:
        freq_ab = cooc_df.at[column, term]
        if freq_ab > 0:
            freq_ab = freq_ab **3
            freq_b = cooc_df.at[term, term]
            pmi2 = math.log(freq_ab / (freq_a * freq_b))
        else:
            pmi2 = None
        pmi_row.append(pmi2)
    pmi_rows_list.append(pmi_row)

In [842]:
pmi_matrix = pd.DataFrame(pmi_rows_list, columns=vocabulary, index=vocabulary)

In [843]:
pmi_matrix_df = pd.DataFrame(pmi_matrix)
pmi_matrix_df

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,-1.998418,-6.850071,-7.032796,-6.830323,-7.305249,-7.764116,-7.666857,-9.227317,-8.547989,-7.957794,...,-20.408068,-13.816395,-18.328627,-20.408068,,-18.328627,-20.408068,-18.328627,,-17.112231
γίγνομαι,-6.850071,-2.156952,-7.225573,-6.779134,-8.432534,-8.732798,-7.664880,-10.501960,-11.457045,-8.642762,...,-16.090651,,-16.090651,,-20.249534,-20.249534,-18.170092,-18.170092,,-16.953697
οὗτος,-7.032796,-7.225573,-2.342035,-8.121069,-8.655221,-9.316039,-10.305539,-8.956697,-10.066576,-10.550136,...,-17.985010,-20.064451,-17.985010,-20.064451,,,,-20.064451,-17.985010,
πολύς,-6.830323,-6.779134,-8.121069,-2.453511,-8.066551,-8.525418,-8.428160,-9.752933,-10.067473,-9.935492,...,-19.952976,-17.873534,,,,-19.952976,-17.873534,-17.873534,-19.952976,-14.577697
ἔχω,-7.305249,-8.432534,-8.655221,-8.066551,-2.623075,-8.801114,-8.770176,-10.769913,-9.511662,-9.788068,...,-16.487574,,-17.703970,-19.783411,,,-19.783411,-16.487574,-19.783411,-19.783411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,-18.328627,-20.249534,,-19.952976,,-19.411735,-19.365759,-17.251501,-19.239170,,...,,,,,,-7.907406,,,,
ἀποπέτομαι,-20.408068,-18.170092,,-17.873534,-19.783411,,-17.286318,,,-19.219561,...,,,,,,,-7.907406,,,
ἀναΐσσει,-18.328627,-18.170092,-20.064451,-17.873534,-16.487574,-17.332293,-19.365759,,,-17.140120,...,,,,,,,,-7.907406,,
ἀλφίτοισι,,,-17.985010,-19.952976,-19.783411,-19.411735,,-19.330943,-19.239170,,...,,,,,,,,,-7.907406,


our data are results of an algorithm to values lower than 1, what basicially implies negative numbers (see image below). Since this is not so well readible and interpretable, we will normalize these data to the scale from 


![](https://upload.wikimedia.org/wikipedia/commons/thumb/1/17/Binary_logarithm_plot_with_ticks.svg/600px-Binary_logarithm_plot_with_ticks.svg.png)

In [844]:
minval, maxval = pmi_matrix_df.min().min(), pmi_matrix_df.max().max()
print(minval, maxval)

-23.404167568348303 -1.9984179609507073


In [849]:
minval_2 = minval * 2
minval_2

-46.808335136696606

In [850]:
pmi_matrix_df.fillna(minval_2, inplace=True)
pmi_matrix_df

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,-1.998418,-6.850071,-7.032796,-6.830323,-7.305249,-7.764116,-7.666857,-9.227317,-8.547989,-7.957794,...,-20.408068,-13.816395,-18.328627,-20.408068,-46.808335,-18.328627,-20.408068,-18.328627,-46.808335,-17.112231
γίγνομαι,-6.850071,-2.156952,-7.225573,-6.779134,-8.432534,-8.732798,-7.664880,-10.501960,-11.457045,-8.642762,...,-16.090651,-46.808335,-16.090651,-46.808335,-20.249534,-20.249534,-18.170092,-18.170092,-46.808335,-16.953697
οὗτος,-7.032796,-7.225573,-2.342035,-8.121069,-8.655221,-9.316039,-10.305539,-8.956697,-10.066576,-10.550136,...,-17.985010,-20.064451,-17.985010,-20.064451,-46.808335,-46.808335,-46.808335,-20.064451,-17.985010,-46.808335
πολύς,-6.830323,-6.779134,-8.121069,-2.453511,-8.066551,-8.525418,-8.428160,-9.752933,-10.067473,-9.935492,...,-19.952976,-17.873534,-46.808335,-46.808335,-46.808335,-19.952976,-17.873534,-17.873534,-19.952976,-14.577697
ἔχω,-7.305249,-8.432534,-8.655221,-8.066551,-2.623075,-8.801114,-8.770176,-10.769913,-9.511662,-9.788068,...,-16.487574,-46.808335,-17.703970,-19.783411,-46.808335,-46.808335,-19.783411,-16.487574,-19.783411,-19.783411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,-18.328627,-20.249534,-46.808335,-19.952976,-46.808335,-19.411735,-19.365759,-17.251501,-19.239170,-46.808335,...,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-7.907406,-46.808335,-46.808335,-46.808335,-46.808335
ἀποπέτομαι,-20.408068,-18.170092,-46.808335,-17.873534,-19.783411,-46.808335,-17.286318,-46.808335,-46.808335,-19.219561,...,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-7.907406,-46.808335,-46.808335,-46.808335
ἀναΐσσει,-18.328627,-18.170092,-20.064451,-17.873534,-16.487574,-17.332293,-19.365759,-46.808335,-46.808335,-17.140120,...,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-7.907406,-46.808335,-46.808335
ἀλφίτοισι,-46.808335,-46.808335,-17.985010,-19.952976,-19.783411,-19.411735,-46.808335,-19.330943,-19.239170,-46.808335,...,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-46.808335,-7.907406,-46.808335


In [857]:
pmi_matrix_norm_df = (pmi_matrix_df - minval_2) / (maxval - minval_2)
pmi_matrix_norm_df

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,1.000000,0.891728,0.887650,0.892169,0.881570,0.871330,0.873500,0.838676,0.853837,0.867008,...,0.589161,0.736264,0.635567,0.589161,0.000000,0.635567,0.589161,0.635567,0.000000,0.662713
γίγνομαι,0.891728,0.996462,0.883348,0.893311,0.856413,0.849712,0.873544,0.810231,0.788917,0.851722,...,0.685511,0.000000,0.685511,0.000000,0.592699,0.592699,0.639105,0.639105,0.000000,0.666251
οὗτος,0.887650,0.883348,0.992332,0.863364,0.851444,0.836696,0.814614,0.844716,0.819947,0.809156,...,0.643235,0.596830,0.643235,0.596830,0.000000,0.000000,0.000000,0.596830,0.643235,0.000000
πολύς,0.892169,0.893311,0.863364,0.989844,0.864581,0.854340,0.856511,0.826946,0.819927,0.822872,...,0.599317,0.645723,0.000000,0.000000,0.000000,0.599317,0.645723,0.645723,0.599317,0.719275
ἔχω,0.881570,0.856413,0.851444,0.864581,0.986060,0.848188,0.848878,0.804251,0.832331,0.826162,...,0.676653,0.000000,0.649507,0.603101,0.000000,0.000000,0.603101,0.676653,0.603101,0.603101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,0.635567,0.592699,0.000000,0.599317,0.000000,0.611396,0.612422,0.659605,0.615247,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.868132,0.000000,0.000000,0.000000,0.000000
ἀποπέτομαι,0.589161,0.639105,0.000000,0.645723,0.603101,0.000000,0.658828,0.000000,0.000000,0.615685,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.868132,0.000000,0.000000,0.000000
ἀναΐσσει,0.635567,0.639105,0.596830,0.645723,0.676653,0.657802,0.612422,0.000000,0.000000,0.662090,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.868132,0.000000,0.000000
ἀλφίτοισι,0.000000,0.000000,0.643235,0.599317,0.603101,0.611396,0.000000,0.613199,0.615247,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.868132,0.000000


In [858]:
key_words

['λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*']

In [859]:
keyterms_pmi_20 = []
for keyterm in key_words:
    keyterm_pmi_20 = pmi_matrix_norm_df.sort_values(keyterm, ascending=False).index[1:21].tolist()
    keyterms_pmi_20.append(keyterm_pmi_20)   #, ascending=False) # ["πόνο*"][:10]

In [860]:
pd.DataFrame(keyterms_pmi_20, index=key_words).T

Unnamed: 0,λύπ*,ἄλγ*,ὀδύν*,πόνο*
0,εἰμί,ὀσφῦς,ἔχω,σῶμα
1,προσήκω,κεφαλή,πυρετός,γίγνομαι
2,ἄνθρωπος,πυρετός,γίγνομαι,πολύς
3,συμβαίνω,ὀδύν*,γαστήρ,εἰμί
4,αὐτός,γαστήρ,ἴσχω,ἄλλος
5,σῶμα,ἔχω,ἰξύς,πυρετός
6,γίγνομαι,εἰμί,κενεών,ἔχω
7,οὐδείς,κοιλία,νειαίρην,κοιλία
8,αἰσθάνομαι,γίγνομαι,κεφαλή,χρόνος
9,οὗτος,ἰξύς,παρέχω,σιτίον


# Tructated SVD


In [861]:
svd = TruncatedSVD(n_components=250)
pmi_svd = svd.fit_transform(pmi_matrix_norm_df)

In [862]:
pmi_svd_df = pd.DataFrame(pmi_svd, index=vocabulary)
pmi_svd_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
εἰμί,24.206335,-0.623011,13.711127,-1.085478,0.521489,-0.808039,-0.792633,1.147835,-0.718566,-1.827209,...,0.191993,-0.406361,-0.270955,-0.538398,0.477554,0.475606,0.224948,-0.408213,0.017722,-0.207916
γίγνομαι,22.749487,-2.506090,12.249666,0.132267,-0.357439,-0.604326,-1.433378,1.259412,-0.605295,-1.433997,...,-0.007014,-0.061761,0.863464,0.293526,0.371569,-0.211660,-0.140615,0.154374,0.082667,-0.048704
οὗτος,22.188726,0.111578,11.672230,-1.878866,0.421162,-1.057038,-0.697253,1.166066,-1.396677,-0.736400,...,-0.445178,0.092027,0.026887,0.095801,-0.422831,0.384229,-0.055173,-0.026027,0.600475,0.388602
πολύς,22.531469,-1.104643,11.947511,-0.678379,-0.748559,-1.649300,-0.499410,0.728468,-0.510823,-2.030171,...,-0.372433,-0.790007,-0.382994,-0.463877,-0.272769,0.233369,0.263872,-0.113881,-0.520071,-0.440499
ἔχω,22.073696,-0.654236,11.272326,-0.085194,0.261290,-0.842073,0.027077,0.473260,-1.266899,-1.718608,...,1.356260,0.141811,-0.096830,-0.252734,0.845168,-0.007619,0.970370,0.187742,-0.329936,-0.565447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,1.328787,-0.418548,-0.778039,-1.001773,0.214534,0.047228,-0.566946,0.117675,-0.570151,-0.214810,...,0.142297,0.187781,0.007818,0.128654,0.117391,0.093451,0.010784,0.087570,0.020767,-0.116506
ἀποπέτομαι,1.965387,-0.630608,-0.728538,0.566804,-0.636986,-1.279257,-0.465432,0.720947,0.224249,-0.755238,...,0.298037,0.181855,-0.017331,-0.389966,-0.092453,-0.019465,-0.051285,-0.096971,0.461871,0.118464
ἀναΐσσει,1.734720,-0.650770,-1.141345,0.490227,0.153035,0.072421,0.039919,0.090705,-0.462771,-0.464262,...,-0.086191,-0.228276,0.125968,0.302755,-0.049901,-0.057261,0.192360,-0.247754,-0.011897,-0.075831
ἀλφίτοισι,1.784169,1.870863,-0.507044,-0.255334,-0.468553,-0.209822,0.377055,0.277802,-1.229686,0.680655,...,0.016866,-0.132507,0.279042,0.131936,-0.058323,0.009187,-0.073749,-0.145255,-0.046873,-0.136521


In [863]:
pmi_svd_cos = pd.DataFrame(cosine_similarity(pmi_svd_df.to_numpy()).round(5), columns=vocabulary, index=vocabulary)
pmi_svd_cos

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,1.00000,0.97144,0.96643,0.97203,0.96137,0.90849,0.87705,0.87916,0.83575,0.88330,...,0.27709,0.26308,0.25262,0.26928,0.21630,0.23618,0.28455,0.27598,0.22954,0.28641
γίγνομαι,0.97144,1.00000,0.94394,0.95503,0.93964,0.89881,0.88173,0.83585,0.79693,0.86498,...,0.29550,0.24888,0.25906,0.26298,0.24100,0.23239,0.30419,0.27359,0.20915,0.30021
οὗτος,0.96643,0.94394,1.00000,0.94621,0.93765,0.90039,0.85832,0.86474,0.84007,0.87265,...,0.27959,0.27554,0.24510,0.27472,0.22582,0.22681,0.27461,0.26958,0.27380,0.27990
πολύς,0.97203,0.95503,0.94621,1.00000,0.94398,0.89764,0.86900,0.85173,0.81106,0.85698,...,0.30105,0.27862,0.25756,0.25502,0.24133,0.24587,0.28279,0.28432,0.24893,0.31110
ἔχω,0.96137,0.93964,0.93765,0.94398,1.00000,0.89682,0.86983,0.85647,0.83632,0.86039,...,0.31202,0.27406,0.28047,0.28874,0.21321,0.22740,0.30096,0.28423,0.26918,0.30997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,0.23618,0.23239,0.22681,0.24587,0.22740,0.27093,0.24391,0.23783,0.23578,0.23393,...,0.16406,0.28879,0.15974,0.20578,0.05626,1.00000,0.12167,0.24360,0.12355,0.23181
ἀποπέτομαι,0.28455,0.30419,0.27461,0.28279,0.30096,0.27974,0.31678,0.28323,0.25903,0.29696,...,0.29944,0.20017,0.17778,0.10708,0.42228,0.12167,1.00000,0.32988,0.07921,0.23796
ἀναΐσσει,0.27598,0.27359,0.26958,0.28432,0.28423,0.30301,0.31499,0.26957,0.24596,0.30731,...,0.37578,0.31227,0.22081,0.28287,0.24301,0.24360,0.32988,1.00000,0.10314,0.22914
ἀλφίτοισι,0.22954,0.20915,0.27380,0.24893,0.26918,0.26654,0.22064,0.29527,0.27243,0.24186,...,0.13298,0.23656,0.13942,0.15118,0.06315,0.12355,0.07921,0.10314,1.00000,0.15052


In [868]:
pmi_svd_cos_pain = {}
for key_word in key_words: 
    ordered_df = pmi_svd_cos.sort_values(key_word, ascending=False)
    pmi_svd_cos_pain[key_word + " word"] = ordered_df.index.tolist()[1:21]
    pmi_svd_cos_pain[key_word + " cos.sim."] = ordered_df[key_word].tolist()[1:21]

In [873]:
pmi_svd_cos_pain_df = pd.DataFrame(pmi_svd_cos_pain)
pmi_svd_cos_pain_df

Unnamed: 0,λύπ* word,λύπ* cos.sim.,ἄλγ* word,ἄλγ* cos.sim.,ὀδύν* word,ὀδύν* cos.sim.,πόνο* word,πόνο* cos.sim.
0,χροιά,0.61357,ὀδύν*,0.75326,πυρετός,0.83606,γίγνομαι,0.80543
1,ἐρωτάω,0.60936,πυρετός,0.75094,ἔχω,0.81416,πολύς,0.80538
2,συμβαίνω,0.60119,κεφαλή,0.71939,γίγνομαι,0.80057,κοιλία,0.80443
3,ἐξέρχομαι,0.59676,πόνο*,0.7191,πολύς,0.79099,πυρετός,0.79858
4,ὁράω,0.58893,ῥῖγος,0.71247,εἰμί,0.78615,σῶμα,0.79407
5,ὅρος,0.58427,ἔχω,0.70959,οὗτος,0.77884,εἰμί,0.78831
6,ψαύω,0.58416,ὀξύς,0.70138,κεφαλή,0.77709,ἔχω,0.78733
7,σπλάγχνον,0.57286,ὀφθαλμός,0.70105,κοιλία,0.76832,ἄλλος,0.76904
8,ἀποκρίνω,0.57139,γαστήρ,0.69824,γαστήρ,0.76254,οὗτος,0.75928
9,πρόσωπον,0.56976,γίγνομαι,0.69028,αἷμα,0.75745,χρόνος,0.75585


In [884]:
# set_with_dataframe(PIA_overview.add_worksheet("pmi_svd_cos_pain", 1, 1), pmi_svd_cos_pain_df)

In [876]:
words = []
for col in pmi_svd_cos_pain_df.columns:
    if "word" in col:
        words.extend(pmi_svd_cos_pain_df[col].tolist())
words_unique = list(set(words))
print(len(words_unique))
print(words_unique)

52
['οὗτος', 'ἄλγ*', 'ἀποκρίνω', 'αὐτός', 'δοκέω', 'ἡμέρα', 'ἰσχυρός', 'Κοίλη', 'πυρετός', 'βληχρός', 'χρόνος', 'πᾶς', 'ἐπάγω', 'ῥῖγος', 'σπλάγχνον', 'κοιλία', 'οἷος', 'εἰμί', 'κεφαλή', 'πόνο*', 'ἐξέρχομαι', 'οὐρέει', 'ἄλλος', 'πότος', 'ὁποῖος', 'συμβαίνω', 'ἐπιλαμβάνω', 'ὀσφῦς', 'μέγας', 'ἴσχω', 'σκληρός', 'αἷμα', 'χροιά', 'ὀξύς', 'ὑγιαίνω', 'ὀφθαλμός', 'ὁράω', 'ὀλίγος', 'ὅρος', 'νόσος', 'γαστήρ', 'γονή', 'λεπτός', 'πρόσωπον', 'ψαύω', 'γίγνομαι', 'σῶμα', 'πολύς', 'ὀδύν*', 'ἔχω', 'φανερός', 'ἐρωτάω']


In [879]:
terms_by_cat_df = get_as_dataframe(PIA_overview.worksheet("terms_by_cat"))
terms_by_cat_dict = dict(zip(terms_by_cat_df["term"], terms_by_cat_df["term_category"]))
coded_words = list(terms_by_cat_dict.keys())
print(coded_words)

['πρότερος', 'ψυχρός', 'λαμβάνω', 'κεφαλή', 'ἰσχυρός', 'ἄνθρωπος', 'χολή', 'ὁπόσος', 'ἄλγ*', 'τρίτος', 'πᾶς', 'πλείων', 'σάρξ', 'συμβαίνω', 'θερμός', 'μαλλός', 'ἡμέρα', 'πόνο*', 'ὀδύν*', 'κοιλία', 'εἶμι', 'ποιέω', 'γίγνομαι', 'ἔχω', 'καθίημι', 'δοκέω', 'δίδωμι', 'ὀξύς', 'πολύς', 'μήτρα', 'ἰξύς', 'κενεών', 'χρόνος', 'σιτίον', 'ἴσχω', 'ἐσθίω', 'νόσος', 'σῶμα', 'ὀσφύς', 'ὀλίγος', 'παύω', 'γαστήρ', 'αἷμα', 'πυρετός', 'ὀφθαλμός', 'οὗτος', 'ἄλλος', 'αὐτός', 'εἰμί', 'λύπ*', 'ὀσφῦς', 'παρέχω', 'οὐδείς', 'προσήκω', 'πλευροῦ', 'τράχηλος', 'ῥῖγος', 'ἐξέρχομαι', 'βήξ', 'σῖτος', 'στῆθος', 'μεμιγμένα', 'ἀειδής', 'νειαίρην', 'ἐπίνοσον']


In [882]:
words_uncoded = [word for word in words_unique if word not in coded_words]
words_uncoded

['ἀποκρίνω',
 'Κοίλη',
 'βληχρός',
 'ἐπάγω',
 'σπλάγχνον',
 'οἷος',
 'οὐρέει',
 'πότος',
 'ὁποῖος',
 'ἐπιλαμβάνω',
 'μέγας',
 'σκληρός',
 'χροιά',
 'ὑγιαίνω',
 'ὁράω',
 'ὅρος',
 'γονή',
 'λεπτός',
 'πρόσωπον',
 'ψαύω',
 'φανερός',
 'ἐρωτάω']

In [883]:
set_with_dataframe(PIA_overview.add_worksheet("words_tmp", 1, 1), pd.DataFrame(words_uncoded))

# Backup: Previous experiments with LSA

In [892]:
svd = TruncatedSVD(n_components=250)
lsa = svd.fit_transform(tfidf)
lsa_doc = pd.DataFrame(lsa) #, columns=vocabulary)
lsa_doc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.291092,0.120812,-0.011414,-0.045221,0.087048,-0.015248,0.134973,0.037312,0.040419,-0.008096,...,-0.023577,0.023873,0.017753,-0.026195,0.012706,0.029686,-0.015625,-0.034971,-0.007640,-0.007286
1,0.113945,-0.048341,0.013215,-0.068549,0.199092,0.277817,0.038364,-0.080837,-0.007310,-0.050582,...,0.005202,0.005137,0.004956,0.000223,0.006238,-0.005648,-0.003812,-0.004576,0.005733,-0.003043
2,0.278826,0.343216,-0.054982,-0.059967,-0.113581,0.003020,0.013958,0.001845,0.020055,-0.038230,...,-0.006006,-0.028921,-0.005711,0.055508,0.012969,-0.002231,-0.004746,-0.062788,0.018479,-0.036495
3,0.178850,-0.017742,0.019273,-0.042593,0.297982,-0.070750,0.074128,-0.163608,-0.043452,-0.087134,...,-0.053331,-0.026377,-0.021582,-0.020280,-0.024655,-0.033270,0.029185,0.019736,-0.016080,0.044814
4,0.066580,-0.000201,-0.002682,0.012326,0.013823,-0.011123,0.012705,0.024484,0.029276,-0.004064,...,-0.000818,0.008670,-0.014765,-0.025303,-0.008954,0.000089,0.012837,-0.018779,0.017352,-0.015500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24451,0.075155,-0.018564,-0.005475,0.073267,0.005832,-0.018919,0.031640,-0.021884,0.068928,-0.067545,...,0.002607,-0.001323,0.003396,0.002406,0.003437,0.003704,0.000441,0.001182,0.002269,-0.001387
24452,0.296445,0.058825,-0.050300,0.135022,0.033343,0.169435,0.042056,0.001736,0.115289,-0.071581,...,-0.002363,0.007474,0.010698,-0.012415,0.003211,0.005329,0.006494,0.007056,-0.009217,-0.000316
24453,0.065155,0.040187,0.399040,0.056255,-0.012848,-0.014455,0.004877,-0.012601,0.004090,-0.011182,...,-0.016274,0.000194,-0.001512,-0.031107,-0.004968,-0.036236,0.006979,-0.010521,-0.037103,-0.038958
24454,0.158804,0.026910,-0.011339,-0.081714,-0.097286,-0.011491,-0.005095,-0.019178,0.002196,-0.016735,...,-0.005564,0.018009,0.014392,0.023937,0.042614,-0.010459,-0.002001,-0.021583,0.007948,0.015325


In [893]:
lsa_word = pd.DataFrame(svd.components_, columns=vocabulary).T #, columns=vocabulary)
lsa_word

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
εἰμί,0.481994,0.757178,-0.126295,-0.137315,-0.309753,0.076535,-0.036132,-0.094879,-0.061312,-0.068219,...,0.000771,-0.004147,-0.002766,0.005831,0.001313,0.001150,-0.002089,0.001149,-0.002744,0.000613
γίγνομαι,0.385735,-0.563874,0.033859,-0.456447,-0.465803,-0.095518,-0.077631,-0.051129,-0.078033,-0.144964,...,-0.001372,-0.001277,-0.008644,0.005807,-0.002493,-0.005655,0.002305,-0.002098,-0.001804,0.000854
οὗτος,0.386984,-0.237714,-0.155228,0.810529,-0.182356,0.086513,0.034810,-0.142527,-0.120311,0.049164,...,-0.006240,0.000887,-0.001492,0.000447,-0.000935,-0.000493,-0.002332,-0.004415,-0.004133,0.001817
πολύς,0.285897,-0.126532,0.034322,-0.172830,0.507344,0.706528,0.095685,-0.206553,-0.021490,-0.126551,...,0.002339,0.002850,-0.002082,0.001739,0.004876,-0.000140,0.003147,0.004958,0.005255,-0.001397
ἔχω,0.207923,-0.000684,-0.004842,0.029985,0.466058,-0.461504,-0.606931,-0.298151,-0.038327,-0.152027,...,-0.001579,0.001877,-0.003560,0.002272,-0.001240,0.000518,-0.001716,0.001732,0.003596,0.002983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,0.000777,0.000133,-0.000011,0.000077,0.000478,-0.000402,0.000665,0.001117,0.002575,0.000311,...,0.000616,0.000805,-0.000060,0.000608,0.001988,0.000368,-0.002334,0.000241,-0.000358,0.000262
ἀποπέτομαι,0.000847,-0.000470,0.000101,-0.000828,0.000885,0.000547,-0.000191,0.000175,0.000523,0.002245,...,0.000160,0.001253,-0.001956,0.000276,-0.001185,0.000101,0.001000,-0.001763,-0.000187,-0.003585
ἀναΐσσει,0.000931,-0.000348,-0.000052,-0.000276,0.000590,-0.000395,-0.000286,-0.000444,0.000062,0.000790,...,-0.000180,0.002646,0.002093,0.000561,-0.000915,0.002104,-0.000686,-0.000118,-0.003367,0.000138
ἀλφίτοισι,0.000256,-0.000042,-0.000006,0.000152,0.000426,-0.000134,-0.000013,0.001373,-0.001122,-0.000171,...,-0.006492,-0.001816,0.000753,-0.000192,0.000098,0.000079,0.002321,-0.000988,-0.002716,0.004547


In [894]:
lsa_word_cos = pd.DataFrame(cosine_similarity(lsa_word.to_numpy()).round(5), columns=vocabulary, index=vocabulary)
lsa_word_cos

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ἀσθενείη,ἀπύρετος,ἀπόπατος,ἀπωθέω,ἀποτελευτάω,ἀποστέλλω,ἀποπέτομαι,ἀναΐσσει,ἀλφίτοισι,ἀλέα
εἰμί,1.00000,-0.00104,-0.00017,-0.00001,-0.00038,-0.00007,-0.00040,-0.00047,-0.00001,-0.00024,...,-0.01677,0.12176,-0.00279,-0.01038,-0.00766,-0.02031,-0.00872,-0.00344,-0.00716,-0.00451
γίγνομαι,-0.00104,1.00000,0.00009,-0.00037,-0.00082,-0.00029,0.00015,-0.00067,-0.00011,-0.00033,...,0.01665,-0.00345,0.06058,-0.00745,-0.00191,-0.00493,0.00815,0.01265,-0.00159,0.01221
οὗτος,-0.00017,0.00009,1.00000,-0.00015,0.00010,-0.00011,-0.00036,0.00030,0.00013,-0.00041,...,0.05083,-0.01183,0.05480,-0.00279,-0.00102,-0.02526,-0.01083,0.01191,0.00364,-0.02468
πολύς,-0.00001,-0.00037,-0.00015,1.00000,-0.00006,-0.00025,0.00008,-0.00024,0.00030,-0.00023,...,-0.02689,0.00808,-0.01440,-0.00393,-0.02168,-0.00753,0.01969,0.01149,-0.00295,0.05481
ἔχω,-0.00038,-0.00082,0.00010,-0.00006,1.00000,-0.00016,-0.00014,-0.00026,0.00014,-0.00011,...,0.02605,-0.02282,-0.01585,-0.00636,-0.01980,-0.01552,-0.00505,0.02694,-0.00575,-0.00837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀποστέλλω,-0.02031,-0.00493,-0.02526,-0.00753,-0.01552,-0.01677,0.10073,0.10660,-0.00582,-0.03260,...,0.04649,-0.05125,-0.03915,-0.01910,-0.01652,1.00000,0.03234,0.02619,-0.07021,-0.02607
ἀποπέτομαι,-0.00872,0.00815,-0.01083,0.01969,-0.00505,-0.01221,0.05677,-0.01239,-0.00451,0.02542,...,0.02873,-0.06885,0.03287,-0.03588,0.17936,0.03234,1.00000,-0.02274,-0.04724,-0.03297
ἀναΐσσει,-0.00344,0.01265,0.01191,0.01149,0.02694,0.01458,0.01206,-0.01116,-0.01048,0.02127,...,0.01167,-0.09959,-0.07131,-0.00159,0.07013,0.02619,-0.02274,1.00000,0.00697,-0.05355
ἀλφίτοισι,-0.00716,-0.00159,0.00364,-0.00295,-0.00575,-0.00255,-0.00138,-0.01112,-0.00518,-0.00910,...,-0.07956,0.05779,-0.04862,-0.03991,-0.00792,-0.07021,-0.04724,0.00697,1.00000,0.03752


In [895]:
lsa_word_cos.sort_values("αἰτία", ascending=False)["αἰτία"][1:11]

ἐπιφέρω        0.39020
ἄδηλος         0.39018
αἴτιος         0.38436
ἀπόκρισις      0.38083
ὅδε            0.37467
πάθος          0.36896
θεός           0.35816
προειρημένα    0.34045
ἔργον          0.32858
καλός          0.32827
Name: αἰτία, dtype: float64

In [896]:
lsa_word_cos.sort_values("ὀδύν*", ascending=False)["ὀδύν*"][1:11]

ἀπαλλάσσηται    0.44013
χλιαίνειν       0.43666
πλευρίτιδι      0.42847
ὀξείη           0.37870
βρέγμα          0.33909
πλευροῦ         0.33297
νάρκη           0.30313
χλιάσματα       0.26793
φοιτάω          0.25847
ἄχθομαι         0.25395
Name: ὀδύν*, dtype: float64

In [897]:
lsa_word_cos.sort_values("πόνο*", ascending=False)["πόνο*"][1:11]

δύσπνοια       0.30235
ἀποπέτομαι     0.26514
γυμνάζω        0.21987
περισσῇσιν     0.21487
κουφίζω        0.21327
ὑποχονδρίῳ     0.20942
εἴλω           0.20890
ἀποτελευτάω    0.20226
σύμμετρος      0.19750
ἀφωνίαι        0.19118
Name: πόνο*, dtype: float64

In [898]:
def get_most_similar(model_df, target_term, number):
    all_similar = []
    for term in model_df.columns:
        similarity = (term, cosine_similarity([model_df[target_term],  model_df[term]])[0][1])
        all_similar.append(similarity)
    return sorted(all_similar, key=lambda number: number[1], reverse=True)[1:number]

In [899]:
key_words = ['λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*']

In [900]:
for word in key_words:
    if word in lsa_word_cos.index:
        print(word)

λύπ*
ἄλγ*
ὀδύν*
πόνο*
