In [148]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [149]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [150]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it import a +600MB file containing ancient Greek dictionary

In [151]:
### not neccessary for reading the data, just for exporting them to sciencedata.dk
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [152]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
PIA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KxOx7Be9fj3lDcEPgQhQ-Iqcn9p367-MMD6RMXe8rks/edit?usp=sharing")
PIA_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1e94wyelg6dftQ4zxbq1xvwxWAI-BhcYXtclDW-YTnrw/edit?usp=sharing")

# Import corpus

In [153]:
publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
c_hippocraticum = sddk.read_file("c_hippocraticum_enriched.json", "df", publicfolder)

reading file located in a public folder


In [154]:
def get_flat_sentences(series):
    sentences_list = [sent for doc in series.tolist() for sent in doc]
    return sentences_list

In [155]:
# docs = sentences
docs = get_flat_sentences(c_hippocraticum["lemmatized_sentences_repl"])

In [156]:
print(docs[:10])

[['ὁπόσος', 'ἐπιχειρέω', 'ἰητρικῆς', 'λέγω', 'γράφω', 'ὑπόθεσις', 'αὐτός', 'ὑποθέμενοι', 'λόγος', 'θερμός', 'ψυχρός', 'ὑγρός', 'ξηρός', 'ἄλλος', 'ἐθέλω', 'βραχύς', 'ἄγω', 'ἀρχή', 'αἰτία', 'ἄνθρωπος', 'νοῦσος', 'θάνατος', 'πᾶς', 'αὐτός', 'εἷς', 'ὑποθέμενοι', 'πολύς', 'λέγω', 'καταφανής', 'εἰμί', 'ἁμαρτάνω', 'ἄξιος', 'μέμφομαι', 'τέχνη', 'εἰμί', 'χράω', 'πᾶς', 'μέγας', 'τιμάω', 'ἀγαθός', 'χειροτέχνης', 'δημιουργός'], ['δημιουργός', 'φαῦλος', 'πολύς', 'διαφέροντες'], ['εἰμί', 'ἰατρικός', 'αὐτός', 'ἔσκεπτο', 'εὑρίσκω', 'μηδείς', 'εἰμί', 'πᾶς', 'ἄπειρος', 'ἀνεπιστήμων', 'εἰμί', 'τύχη', 'κάμνω', 'διοικεῖτο'], ['ἔχω', 'ἄλλος', 'τέχνη', 'δημιουργός', 'πολύς', 'διαφέρω', 'χείρ', 'γνώμη', 'ἰητρικῆς'], ['ἀξιόω', 'κενός', 'ὑπόθεσις', 'δέω', 'ἀφανέα', 'ἀπορεόμενα', 'ἀνάγκη', 'ἐπιχειρέω', 'λέγω', 'ὑπόθεσις', 'χράω', 'οἷος', 'μετέωρος', 'γῆ'], ['τὶς', 'λέγω', 'γιγνώσκω', 'ἔχω', 'λέγω', 'ἀκούω', 'δῆλος', 'εἰμί', 'ἀληθής', 'εἰμί'], ['χρή', 'ἀνενέγκαντα', 'οἶδα', 'σαφής'], ['ἰατρικός', 'πᾶς', 'ὑπάρχω', 

In [157]:
c_hippocraticum["wordcount"].sum()

333443

In [158]:
c_hippocraticum["lemmata_repl"].apply(lambda x: len(x)).sum()

171332

In [159]:
# perhaps we will also explore our subcorpora at some point
subcorpora = {}
for cat in c_hippocraticum["work_cat_linka"].unique():
    subcorpora[cat] = get_flat_sentences(c_hippocraticum[c_hippocraticum["work_cat_linka"]==cat]["lemmatized_sentences_repl"])

for key in subcorpora.keys():
    print(key, len(subcorpora[key]))

Theoretical 2501
Other 16911
Practical 5044


In [507]:
#create gensim dictionary for our list of sentences
dictionary_sents = corpora.Dictionary(docs)

there is a number of methods to be applied upon a dictionary object:
see: https://radimrehurek.com/gensim/corpora/dictionary.html

First of all, you can inspect it as a standard dictionary object


In [161]:
# uncomment below to print the whole dictionary
# dictionary

In [508]:
# as such, it is organized by ids
dict(list(dictionary_sents.items())[:10])

{0: 'αἰτία',
 1: 'αὐτός',
 2: 'βραχύς',
 3: 'γράφω',
 4: 'δημιουργός',
 5: 'εἰμί',
 6: 'εἷς',
 7: 'θάνατος',
 8: 'θερμός',
 9: 'καταφανής'}

In [509]:
# but you can access it reveresly by applying token2id method
dict(list(dictionary_sents.token2id.items())[:10])

{'αἰτία': 0,
 'αὐτός': 1,
 'βραχύς': 2,
 'γράφω': 3,
 'δημιουργός': 4,
 'εἰμί': 5,
 'εἷς': 6,
 'θάνατος': 7,
 'θερμός': 8,
 'καταφανής': 9}

In [510]:
# collection_frequencies
dictionary_sents.cfs[7] # how many instances of word with id 7 (= "θάνατος")

81

In [511]:
dictionary_sents.dfs[7] # how many documents contain the word with id 7 (= "θάνατος")

81

In [512]:
dictionary_sents.num_pos # previously 167358

171332

In [513]:
len(dictionary_sents.keys())

25275

In [168]:
sent = docs[20]
sent

['ἀνάγκη',
 'ἰατρικός',
 'ποιέω',
 'ζητέω',
 'εὑρίσκω',
 'ἄνθρωπος',
 'κάμνω',
 'ταὐτός',
 'προσφερομένοισι',
 'ὑγιαίνοντες',
 'συμφέρω',
 'συμφέρω']

In [514]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary_sents.doc2bow(sent)

[(28, 1),
 (40, 1),
 (41, 1),
 (47, 1),
 (57, 1),
 (72, 1),
 (118, 1),
 (119, 2),
 (122, 1),
 (123, 1),
 (124, 1)]

In [515]:
# we can use our gensim dictionary

# as a corpus, we cannot use the Gensim default BoW model,

# we just need the words replaced by values

corpus_bow = [dictionary_sents.doc2bow(sent) for sent in docs]
corpus_idx = [dictionary_sents.doc2idx(sent) for sent in docs]

# LSA with  sklearn

In [573]:
# build gensim dictionary on the basis of whole works
lemmata = c_hippocraticum["lemmata_repl"].tolist()
dictionary = corpora.Dictionary(lemmata)
# words in at least 2 works
# with the number of sentences in which they appear
dictionary_tup = [(len([doc for doc in docs if dictionary[word_id] in doc]), dictionary[word_id]) for word_id in dictionary if dictionary.dfs[word_id] >= 2]
# n sorted words with highest frequency
n = 1000
vocabulary = [tup[1] for tup in sorted(dictionary_tup, reverse=True)[:n]]
freqs = [tup[0] for tup in sorted(dictionary_tup, reverse=True)[:n]]

In [574]:
sorted(dictionary_tup, reverse=True)[:10]

[(3315, 'εἰμί'),
 (2829, 'γίγνομαι'),
 (2351, 'οὗτος'),
 (2103, 'πολύς'),
 (1775, 'ἔχω'),
 (1224, 'ἄλλος'),
 (1169, 'σῶμα'),
 (1129, 'ποιέω'),
 (1030, 'χρή'),
 (1010, 'αὐτός')]

In [575]:
len(vocabulary)

1000

In [576]:
for word in key_words:
    if word in vocabulary:
        print(word)

λύπ*
ἄλγ*
ὀδύν*
πόνο*


In [577]:
vocabulary[:10]

['εἰμί',
 'γίγνομαι',
 'οὗτος',
 'πολύς',
 'ἔχω',
 'ἄλλος',
 'σῶμα',
 'ποιέω',
 'χρή',
 'αὐτός']

In [578]:
docs[0]

['ὁπόσος',
 'ἐπιχειρέω',
 'ἰητρικῆς',
 'λέγω',
 'γράφω',
 'ὑπόθεσις',
 'αὐτός',
 'ὑποθέμενοι',
 'λόγος',
 'θερμός',
 'ψυχρός',
 'ὑγρός',
 'ξηρός',
 'ἄλλος',
 'ἐθέλω',
 'βραχύς',
 'ἄγω',
 'ἀρχή',
 'αἰτία',
 'ἄνθρωπος',
 'νοῦσος',
 'θάνατος',
 'πᾶς',
 'αὐτός',
 'εἷς',
 'ὑποθέμενοι',
 'πολύς',
 'λέγω',
 'καταφανής',
 'εἰμί',
 'ἁμαρτάνω',
 'ἄξιος',
 'μέμφομαι',
 'τέχνη',
 'εἰμί',
 'χράω',
 'πᾶς',
 'μέγας',
 'τιμάω',
 'ἀγαθός',
 'χειροτέχνης',
 'δημιουργός']

In [579]:
vec_tfidf = TfidfVectorizer(vocabulary=vocabulary, token_pattern=r"\w+\*?") ### initiaze the model
vec_bow = CountVectorizer(vocabulary=vocabulary, token_pattern=r"\w+\*?")

tfidf = vec_tfidf.fit_transform([" ".join(sentence) for sentence in docs]) ### run the model
bow = vec_bow.fit_transform([" ".join(sentence) for sentence in docs]) ### run the model

cooc  = bow.T * bow
cooc.setdiag(freqs) # word-word coocurrence matrix
cooc_df = pd.DataFrame(cooc.todense() / len(docs), columns=vocabulary, index=vocabulary)


Changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.



In [580]:
cooc_df

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ῥοφήμασι,ὑποτίθημι,ὄγκος,ὁρῇν,ὀῤῥὸν,Ἕλλην,ἕπομαι,ἐρυσίπελας,ἐπάγω,ἐοῦσιν
εἰμί,0.135550,0.025515,0.022571,0.023266,0.018768,0.014230,0.014475,0.008505,0.010345,0.012512,...,0.000041,0.000204,0.000245,0.000164,0.000368,0.000000,0.000082,0.000123,0.000327,0.000082
γίγνομαι,0.025515,0.115677,0.020077,0.022448,0.012226,0.009773,0.013739,0.005275,0.003721,0.009446,...,0.000041,0.000000,0.000000,0.000041,0.000000,0.000000,0.000041,0.000409,0.000245,0.000491
οὗτος,0.022571,0.020077,0.096132,0.013494,0.010672,0.007565,0.005357,0.008301,0.005561,0.004702,...,0.000082,0.000164,0.000041,0.000041,0.000123,0.000000,0.000123,0.000041,0.000327,0.000368
πολύς,0.023266,0.022448,0.013494,0.085991,0.012512,0.009486,0.009650,0.006133,0.005357,0.005561,...,0.000082,0.000082,0.000164,0.000041,0.000164,0.000000,0.000041,0.000245,0.000368,0.000082
ἔχω,0.018768,0.012226,0.010672,0.012512,0.072579,0.008178,0.008137,0.004130,0.006093,0.005520,...,0.000204,0.000082,0.000123,0.000000,0.000082,0.000000,0.000082,0.000204,0.000245,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ἕλλην,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000859,0.000000,0.000000,0.000000,0.000000
ἕπομαι,0.000082,0.000041,0.000123,0.000041,0.000082,0.000123,0.000164,0.000041,0.000041,0.000123,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000859,0.000000,0.000000,0.000000
ἐρυσίπελας,0.000123,0.000409,0.000041,0.000245,0.000204,0.000000,0.000123,0.000041,0.000000,0.000082,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000859,0.000000,0.000000
ἐπάγω,0.000327,0.000245,0.000327,0.000368,0.000245,0.000082,0.000123,0.000082,0.000000,0.000041,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000859,0.000000


In [581]:
pmi_rows_list = []
for column in cooc_df.columns:
    freq_a = cooc_df.at[column, column]
    pmi_row = []
    for term in cooc_df.index:
        freq_ab = cooc_df.at[column, term]
        if freq_ab > 0:
            freq_b = cooc_df.at[term, term]
            pmi = math.log(freq_ab / (freq_a * freq_b))
        else:
            pmi = 0
        pmi_row.append(pmi)
    pmi_rows_list.append(pmi_row)

In [582]:
pmi_matrix = pd.DataFrame(pmi_rows_list, columns=vocabulary, index=vocabulary)

In [583]:
pd.DataFrame(pmi_matrix)

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,χρή,αὐτός,...,ῥοφήμασι,ὑποτίθημι,ὄγκος,ὁρῇν,ὀῤῥὸν,Ἕλλην,ἕπομαι,ἐρυσίπελας,ἐπάγω,ἐοῦσιν
εἰμί,1.998418,0.486890,0.549370,0.691178,0.645912,0.740741,0.803811,0.306868,0.594493,0.804297,...,-1.046104,0.563333,0.745655,0.340190,1.151120,0.000000,-0.352957,0.052508,1.033337,-0.352957
γίγνομαι,0.486890,2.156952,0.590800,0.813931,0.375840,0.523536,0.910159,-0.012323,-0.269502,0.681664,...,-0.887570,0.000000,0.000000,-0.887570,0.000000,0.000000,-0.887570,1.415015,0.904189,1.597337
οὗτος,0.549370,0.590800,2.342035,0.490007,0.425000,0.452511,0.153328,0.626153,0.317376,0.169261,...,-0.009340,0.683807,-0.702488,-0.702488,0.396125,0.000000,0.396125,-0.702488,1.376954,1.494737
πολύς,0.691178,0.813931,0.490007,2.453511,0.695540,0.790369,0.853439,0.435058,0.391394,0.448460,...,0.102135,0.102135,0.795283,-0.591012,0.795283,0.000000,-0.591012,1.200748,1.606213,0.102135
ἔχω,0.645912,0.375840,0.425000,0.695540,2.623075,0.811513,0.852476,0.209108,0.689707,0.610644,...,1.187991,0.271700,0.677165,0.000000,0.271700,0.000000,0.271700,1.187991,1.370312,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ἕλλην,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,7.060108,0.000000,0.000000,0.000000,0.000000
ἕπομαι,-0.352957,-0.887570,0.396125,-0.591012,0.271700,1.048841,1.382499,0.031021,0.122794,1.241015,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.060108,0.000000,0.000000,0.000000
ἐρυσίπελας,0.052508,1.415015,-0.702488,1.200748,1.187991,0.000000,1.094817,0.031021,0.000000,0.835550,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.060108,0.000000,0.000000
ἐπάγω,1.033337,0.904189,1.376954,1.606213,1.370312,0.643376,1.094817,0.724168,0.000000,0.142403,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.060108,0.000000


In [584]:
key_words

['λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*']

In [590]:
keyterms_pmi_20 = []
for keyterm in key_words:
    keyterm_pmi_20 = pmi_matrix.sort_values(keyterm, ascending=False).index[:20].tolist() #, ascending=False) # ["πόνο*"][:10]

SyntaxError: invalid syntax (<ipython-input-590-dcecd17a5361>, line 1)

In [319]:
svd = TruncatedSVD(n_components=50)
lsa = svd.fit_transform(bow)
lsa_doc = pd.DataFrame(lsa) #, columns=vocabulary)
lsa_doc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,2.719352,-0.946350,0.644003,-0.479278,-0.000591,0.253808,1.235649,-1.173183,-0.314559,1.947862,...,-0.025403,-0.206363,0.103598,0.526784,-0.259056,0.166869,0.056904,0.132264,0.426726,0.165400
1,0.289225,0.205214,0.664262,-0.536717,-0.299348,-0.105460,-0.120221,-0.014899,-0.004718,-0.006088,...,-0.009254,-0.005656,0.002487,0.016351,-0.004690,0.019584,-0.001093,-0.003512,-0.003511,0.000941
2,2.207826,-1.955453,-0.699439,-0.256683,-0.194288,0.068972,-0.001774,-0.300178,0.069948,0.941525,...,-0.048961,-0.036476,-0.001234,0.006582,0.006286,-0.003437,0.033407,0.066411,0.041445,-0.034696
3,0.670880,0.280523,1.021210,-0.293352,0.601927,-0.517259,0.100278,-0.520457,-0.553439,-0.268679,...,0.040610,-0.193855,0.149862,0.085975,-0.014085,-0.173272,0.126251,-0.047112,0.009712,-0.077423
4,0.182567,0.019514,0.061156,0.050322,0.037277,0.036538,0.116766,-0.090075,0.054945,0.072980,...,0.083668,-0.301222,0.143821,0.041089,0.378318,0.283328,-0.057332,0.166860,0.022792,0.033167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24451,0.042207,0.001500,0.018703,0.022013,-0.006753,-0.011674,0.043674,-0.049720,0.015438,0.026815,...,0.117181,-0.002730,0.049617,-0.016071,0.030298,0.063258,-0.106945,0.022816,0.042366,0.021482
24452,1.357183,-0.311095,0.746102,0.244192,-0.810402,-0.166962,0.020705,-0.136485,0.623211,-0.311486,...,-0.099211,-0.004870,0.050922,0.043304,0.008461,0.035353,-0.020094,0.013599,0.064094,-0.036680
24453,0.091251,-0.003093,0.025745,0.017456,0.044500,-0.005007,0.096511,-0.037041,0.010544,0.049355,...,-0.025827,0.135412,-0.032259,0.080318,-0.107327,-0.007146,0.147149,0.008024,-0.000520,0.076246
24454,1.223910,0.038951,-0.737959,-0.182085,-0.113153,-0.014372,-0.042287,-0.006245,-0.047540,-0.032605,...,-0.151843,-0.010663,0.058249,-0.001547,0.052898,0.000391,-0.016651,0.054690,0.087655,-0.002582


In [320]:
lsa_word = pd.DataFrame(svd.components_, columns=vocabulary).T #, columns=vocabulary)
lsa_word

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
εἰμί,0.660445,-0.662520,-0.258868,-0.117724,-0.103106,-0.005671,-0.102490,0.020527,-0.024862,-0.038292,...,-0.011008,-0.003035,0.003315,-0.002078,0.000977,-0.001378,0.002972,0.011551,0.002115,-0.013920
γίγνομαι,0.413496,0.689925,-0.536049,-0.110476,-0.037957,0.036677,-0.115795,-0.035614,-0.017448,-0.055460,...,-0.015646,-0.013055,-0.017801,-0.019137,0.002148,0.003137,0.009506,0.000444,-0.003195,-0.011386
οὗτος,0.241928,0.116833,0.222601,0.774644,-0.462150,-0.175451,-0.086483,0.074309,-0.063097,0.019244,...,-0.013059,-0.012242,0.010411,-0.004257,-0.008454,0.012213,-0.007024,0.004380,0.008980,-0.004306
πολύς,0.289225,0.205213,0.664263,-0.536717,-0.299348,-0.105460,-0.120221,-0.014904,-0.004724,-0.006078,...,-0.009144,-0.005653,0.003566,0.016516,-0.004570,0.019937,-0.001299,-0.003636,-0.003621,0.001165
ἔχω,0.194627,0.050946,0.231408,0.177992,0.740868,-0.361927,-0.368411,-0.005295,-0.012124,-0.034553,...,-0.004066,-0.000882,-0.004313,-0.001601,-0.006967,0.006865,-0.009224,-0.010087,-0.001472,-0.006798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ῥόφημα,0.001417,0.000144,0.001124,0.000584,0.001370,0.003197,-0.001515,0.000207,-0.000680,-0.000399,...,0.001397,0.001769,-0.003753,0.003845,0.003245,-0.000283,0.001488,-0.001669,-0.002268,0.001928
ῥοφέειν,0.000915,0.000189,0.002246,0.001835,-0.000050,0.007216,-0.002016,0.001664,-0.002510,0.000173,...,-0.002507,-0.002234,-0.002181,0.005391,0.007207,-0.000199,-0.000226,-0.001971,0.000661,0.001899
ὑδατώδεα,0.000915,0.000111,-0.000151,-0.000322,-0.000233,0.000286,0.000440,-0.000508,-0.000145,0.000058,...,0.000076,0.000057,-0.001292,-0.002224,-0.002777,-0.001124,0.000232,0.001150,0.000813,-0.000635
ἥδομαι,0.001328,0.000115,0.000702,0.003002,-0.000545,-0.000220,0.000615,0.001289,-0.000830,0.001093,...,0.002716,0.003273,-0.003212,0.001400,-0.001217,0.000011,0.001779,-0.000107,-0.002305,0.000193


In [321]:
lsa_word_cos = pd.DataFrame(cosine_similarity(lsa_word.to_numpy()).round(5), columns=vocabulary, index=vocabulary)
lsa_word_cos

Unnamed: 0,εἰμί,γίγνομαι,οὗτος,πολύς,ἔχω,ἄλλος,σῶμα,ποιέω,αὐτός,χρή,...,δάκρυον,γόνος,γλυκείης,βαρύνω,αὐτέῃ,ῥόφημα,ῥοφέειν,ὑδατώδεα,ἥδομαι,ἔνι
εἰμί,1.00000,-0.00276,-0.00081,-0.00079,-0.00136,-0.00102,-0.00079,-0.00205,-0.00051,-0.00130,...,-0.01590,-0.01631,-0.01015,-0.01672,-0.00210,0.02315,-0.00088,0.02725,-0.02608,-0.04566
γίγνομαι,-0.00276,1.00000,-0.00179,-0.00105,-0.00217,-0.00079,-0.00035,-0.00361,-0.00177,-0.00117,...,0.05500,0.04885,-0.00707,-0.01788,-0.09643,0.01245,-0.02740,0.02460,-0.02684,-0.06344
οὗτος,-0.00081,-0.00179,1.00000,-0.00075,-0.00037,-0.00145,-0.00040,-0.00104,-0.00218,-0.00131,...,0.01135,0.00913,0.05336,0.01578,-0.02438,-0.02024,0.07618,0.00503,0.24163,0.12392
πολύς,-0.00079,-0.00105,-0.00075,1.00000,-0.00058,-0.00089,-0.00111,-0.00103,-0.00030,-0.00108,...,0.14317,-0.01664,-0.01339,0.00237,0.05066,0.01128,0.03162,0.01743,-0.09267,0.05608
ἔχω,-0.00136,-0.00217,-0.00037,-0.00058,1.00000,-0.00063,-0.00183,-0.00128,-0.00096,-0.00300,...,-0.01059,0.15464,0.00640,0.16113,0.01695,0.07058,-0.04983,-0.03869,-0.02944,-0.00923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ῥόφημα,0.02315,0.01245,-0.02024,0.01128,0.07058,-0.00139,-0.06455,-0.00682,0.00111,-0.00336,...,0.08414,-0.01160,-0.04153,0.06971,-0.01175,1.00000,0.56482,0.30664,0.31992,-0.06663
ῥοφέειν,-0.00088,-0.02740,0.07618,0.03162,-0.04983,-0.03042,-0.01789,-0.04978,0.01695,-0.03750,...,-0.03686,0.01676,0.30478,0.20588,0.08807,0.56482,1.00000,-0.10749,-0.08645,-0.14314
ὑδατώδεα,0.02725,0.02460,0.00503,0.01743,-0.03869,0.07403,-0.03489,0.02076,0.02557,-0.01687,...,0.16403,0.00537,0.00007,-0.12211,0.01455,0.30664,-0.10749,1.00000,0.14047,0.09611
ἥδομαι,-0.02608,-0.02684,0.24163,-0.09267,-0.02944,-0.03528,0.03276,-0.07135,-0.00063,-0.03242,...,0.30579,-0.01606,-0.20745,0.15248,-0.15925,0.31992,-0.08645,0.14047,1.00000,0.05576


In [322]:
lsa_word_cos.sort_values("αἰτία", ascending=False)["αἰτία"][1:11]

βίος         0.65026
ἰατρικός     0.64816
ἐπίσταμαι    0.62944
τέχνη        0.62456
πρᾶγμα       0.62205
νοῦσος       0.58908
ἰδιώτης      0.58631
ὑγιαίνω      0.57987
εἶδος        0.57907
ἰατρός       0.56876
Name: αἰτία, dtype: float64

In [323]:
lsa_word_cos.sort_values("ὀδύν*", ascending=False)["ὀδύν*"][1:11]

γονεύς          0.59323
μήτηρ           0.57373
καταμηνίων      0.51599
ἔρχομαι         0.50168
παραγίγνομαι    0.47652
νέω             0.45775
παιδίον         0.45711
αἴτιος          0.45556
πνεῦμα          0.45100
βόρειος         0.43999
Name: ὀδύν*, dtype: float64

In [324]:
lsa_word_cos.sort_values("πόνο*", ascending=False)["πόνο*"][1:11]

τίκτω         0.53473
γυνή          0.52920
κατάκειμαι    0.44862
ἀνήρ          0.44372
παρθένος      0.42499
σώζω          0.41553
λόγος         0.39876
ἄρσην         0.39855
σκοπέω        0.39723
καταμηνίων    0.39321
Name: πόνο*, dtype: float64

In [121]:
def get_most_similar(model_df, target_term, number):
    all_similar = []
    for term in model_df.columns:
        similarity = (term, cosine_similarity([model_df[target_term],  model_df[term]])[0][1])
        all_similar.append(similarity)
    return sorted(all_similar, key=lambda number: number[1], reverse=True)[1:number]

In [122]:
key_words = ['λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*']

In [131]:
for word in key_words:
    if word in lsa_word_cos.index:
        print(word)

λύπ*
ἄλγ*
ὀδύν*
πόνο*


In [123]:
get_most_similar(lsa_word, "αἰτία", 10)

KeyError: 'αἰτία'

In [90]:
import json

In [91]:
publicfolder = "8fe7d59de1eafe5f8eaebc0044534606"
morpheus_by_lemma = json.loads(requests.get("https://sciencedata.dk/public/" + publicfolder + "/morpheus_by_lemma.json").content)