In [147]:
### PREREQUISITES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [148]:
# gensim parts
from gensim import corpora
from gensim import models
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet

In [3]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it import a +600MB file containing ancient Greek dictionary

In [149]:
### not necessary for reading the data, only for exporting results to sciencedata.dk
### sddk.configure() interactively asks for the sciencedata.dk username and password
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [150]:
# To access the Google Sheets, you need a Google service-account key (a JSON file).
# Mine is stored in my personal space on sciencedata.dk, so it is read from there:

# (1) read the key file and parse its content as JSON
# NOTE(review): conf[0] is presumably an authenticated HTTP session and conf[1] the endpoint URL - confirm against sddk
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes used here (spreadsheets feed and Drive)
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the scoped credentials to authenticate the gspread client
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheets identified by their URLs
PIA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KxOx7Be9fj3lDcEPgQhQ-Iqcn9p367-MMD6RMXe8rks/edit?usp=sharing")
PIA_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1e94wyelg6dftQ4zxbq1xvwxWAI-BhcYXtclDW-YTnrw/edit?usp=sharing")

In [151]:
# identifier of the public sciencedata.dk folder that holds the dataset
publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
# read the enriched corpus file from that folder
# NOTE(review): the "df" argument presumably asks sddk for a pandas DataFrame (the result is used as one below) - confirm
c_hippocraticum = sddk.read_file("c_hippocraticum_enriched.json", "df", publicfolder)

reading file located in a public folder


In [152]:
# Lemmata that are dropped from every sentence.
words_to_remove = ["ἢν", "ὁκόταν"]

# (1) remove the unwanted lemmata from every sentence of every document
c_hippocraticum["lemmatized_sentences_repl"] = c_hippocraticum["lemmatized_sentences_repl"].apply(
    lambda sentences_list: [
        [word for word in sentence if word not in words_to_remove]
        for sentence in sentences_list
    ]
)

# Mapping {lemma as found in the data: lemma to use instead}.
words_to_replace = {
    "ἔχις" : "ἔχω"}

# (2) apply all replacements in a single pass over the column.
# The replacement is looked up in `words_to_replace`; the previous version read it
# from `corrections`, a name that is not defined anywhere in the visible notebook,
# so the cell failed with a NameError on a fresh kernel.
# Replacements are not chained: a replaced lemma is not looked up again.
c_hippocraticum["lemmatized_sentences_repl"] = c_hippocraticum["lemmatized_sentences_repl"].apply(
    lambda sentences_list: [
        [words_to_replace.get(word, word) for word in sentence]
        for sentence in sentences_list
    ]
)

# Corpora

In [153]:
# list the columns available in the loaded dataframe
c_hippocraticum.columns

Index(['filename', 'author', 'title', 'string', 'wordcount', 'author_id',
       'doc_id', 'raw_date', 'date_avr', 'date_probs', 'date_manual',
       'provenience', 'sentences', 'lemmata', 'lemmata_wordcount',
       'lemmatized_sentences', 'n_sentences', 'lemmata_repl',
       'lemmatized_sentences_repl', 'λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*',
       'terms_sum', 'λύπ*_TF', 'ἄλγ*_TF', 'ὀδύν*_TF', 'πόνο*_TF', 'TF_sum',
       'work_cat_jouanna', 'work_cat_craik', 'work_cat_linka'],
      dtype='object')

In [154]:
def get_flat_sentences(series):
    """Concatenate the per-document sentence lists of a Series into one list.

    Each element of `series` is expected to be an iterable of sentences
    (one element per document). The sentences of all documents are returned
    in their original order as a single flat list.
    """
    flattened = []
    for document in series.tolist():
        # append every sentence of this document, keeping their order
        flattened.extend(document)
    return flattened

In [155]:
# all lemmatized sentences of the corpus as one flat list (each sentence is a list of lemmata)
sentences_list = get_flat_sentences(c_hippocraticum["lemmatized_sentences_repl"])

In [156]:
# Flatten one level further: a single list holding every lemma of the corpus.
# (Despite its name, `flat_sentences` contains words, not sentences.)
flat_sentences = [lemma for sent_lemmata in sentences_list for lemma in sent_lemmata]

In [157]:
# preview the first ten lemmatized sentences
print(sentences_list[:10])

[['ὁπόσος', 'ἐπιχειρέω', 'ἰητρικῆς', 'λέγω', 'γράφω', 'ὑπόθεσις', 'λόγος', 'θερμός', 'ψυχρός', 'ὑγρός', 'ξηρός', 'ἐθέλω', 'βραχὺ', 'ἀρχην', 'αἰτία', 'ἄνθρωπος', 'νόσος', 'θάνατος', 'πᾶς', 'ἵημι', 'πολύς', 'λέγω', 'καταφανής', 'εἶμι', 'ἄξιος', 'μέμφομαι', 'τέχνη', 'χρέονταί', 'πᾶς', 'μέγας', 'τιμάω', 'ἀγαθοὺς', 'χειροτέχνης', 'δημιουργός'], ['εἶμι', 'δημιουργός', 'φαῦλος', 'πολύς'], ['ἰητρικη', 'ὅλοξ', 'μηδʼ', 'ἔσκεπτο', 'μηδʼ', 'εὑρίσκω', 'μηδείς', 'πᾶς', 'ὅμοιος', 'ἄπειροί', 'ἀνεπιστήμων', 'τύχη', 'πᾶς', 'κάμνω', 'διοικεῖτο'], ['ἔχω', 'τέχνη', 'πᾶς', 'δημιουργός', 'πολύς', 'ἀλλήλων', 'διαφέρω', 'χείρ', 'γνώμη', 'ἰητρικῆς'], ['ἀξιόω', 'αὐτην', 'κενός', 'ὑπόθεσις', 'δέω', 'ἀφανέα', 'ἀπορεόμενα', 'ἀνάγκη', 'ἐπιχειρέω', 'λέγω', 'ὑπόθεσις', 'χράομαι', 'οἷος', 'μετέωρος', 'γῆ'], ['εἶμι', 'λέγω', 'γιγνώσκω', 'ἔχω', 'οὔτʼ', 'λέγω', 'ἀκούω', 'δῆλος', 'ἐάω', 'ἀληθής'], ['χρῆ', 'ἀνενέγκαντα', 'οἶδα', 'σαφής'], ['ἰατρικός', 'πάλη', 'πᾶς', 'ὑπάρχω', 'ἀρχη', 'ὁδός', 'ἣν', 'πολύς', 'καλός', 'εὑρίσκω

In [158]:
# total number of lemmata in the corpus: per-document lemma counts, summed
c_hippocraticum["lemmata_repl"].map(len).sum()

171233

In [159]:
# One flat list of lemmatized sentences per work category ("work_cat_linka").
subcorpora = {
    category: get_flat_sentences(
        c_hippocraticum.loc[c_hippocraticum["work_cat_linka"] == category, "lemmatized_sentences_repl"]
    )
    for category in c_hippocraticum["work_cat_linka"].unique()
}

In [160]:
# report how many sentences each subcorpus contains
for category, category_sentences in subcorpora.items():
    print(category, len(category_sentences))

Theoretical 2504
Other 16911
Practical 5044


In [161]:
#create dictionary for our list of sentences
# create a gensim Dictionary from our list of sentences;
# every sentence is passed in as one document
dictionary = corpora.Dictionary(sentences_list)

There are a number of methods that can be applied to a dictionary object;
see: https://radimrehurek.com/gensim/corpora/dictionary.html

First of all, you can inspect it like a standard Python dictionary.


In [162]:
# uncomment below to print the whole dictionary
# dictionary

In [163]:
# as such, it is organized by ids (id -> token); show the first ten entries
dict(list(dictionary.items())[:10])

{0: 'αἰτία',
 1: 'βραχὺ',
 2: 'γράφω',
 3: 'δημιουργός',
 4: 'εἶμι',
 5: 'θάνατος',
 6: 'θερμός',
 7: 'καταφανής',
 8: 'λέγω',
 9: 'λόγος'}

In [164]:
# but you can also access it in reverse (token -> id) through the token2id attribute
dict(list(dictionary.token2id.items())[:10])

{'αἰτία': 0,
 'βραχὺ': 1,
 'γράφω': 2,
 'δημιουργός': 3,
 'εἶμι': 4,
 'θάνατος': 5,
 'θερμός': 6,
 'καταφανής': 7,
 'λέγω': 8,
 'λόγος': 9}

In [165]:
# cfs = collection frequencies
dictionary.cfs[5] # total number of occurrences of the word with id 5 (= "θάνατος")

80

In [166]:
# dfs = document frequencies; a "document" here is one sentence, because the dictionary was built from sentences_list
dictionary.dfs[5] # how many documents contain the word with id 5 (= "θάνατος")

80

In [167]:
# num_pos: total number of tokens (corpus positions) the dictionary has processed
dictionary.num_pos

167358

In [168]:
# size of the vocabulary, i.e. number of ids in the dictionary
len(dictionary.keys())

25059

In [169]:
# pick one sentence as a worked example for the bag-of-words conversion below
sent = sentences_list[20]
sent

['αὐτη',
 'ἀνάγκη',
 'ἰητρικην',
 'ποιέω',
 'ζητηθῆναί',
 'εὑρίσκω',
 'ἄνθρωπος',
 'κάμνω',
 'ταὐτάζω',
 'προσφερομένοισι',
 'συμφέρω',
 'συμφέρω']

In [170]:
# doc2bow = document to (token id, count) tuples, i.e. a bag-of-words;
# a token that occurs twice in the sentence appears once with count 2
dictionary.doc2bow(sent)

[(23, 1),
 (34, 1),
 (35, 1),
 (57, 1),
 (127, 1),
 (128, 2),
 (131, 1),
 (132, 1),
 (133, 1),
 (134, 1),
 (135, 1)]

In [171]:
# build the bag-of-words corpus: one doc2bow vector per sentence
corpus = [dictionary.doc2bow(sent) for sent in sentences_list]

In [172]:
# look at the first five sentences in their bag-of-words form:
print(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(3, 1), (4, 1), (14, 1), (32, 1)], [(15, 2), (33, 1), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)], [(3, 1), (14, 1), (15, 1), (16, 1), (27, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)], [(8, 1), (26, 1), (31, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)]]


# TfIdf with Gensim

In [173]:
# fit a TF-IDF model on the bag-of-words corpus
# (`models` was already imported from gensim near the top of the notebook; the import is repeated here)
from gensim import models
tfidf = models.TfidfModel(corpus)

In [174]:
# apply the TF-IDF model to the corpus and print the weighted vectors of the first five sentences
for sentence in tfidf[corpus][:5]:
    print(sentence)

[(0, 0.1555546501952692), (1, 0.2125392102552277), (2, 0.14848529441051223), (3, 0.22225540794565096), (4, 0.10264228523388086), (5, 0.15589305260387784), (6, 0.10264228523388086), (7, 0.20149391757656823), (8, 0.2906112067353352), (9, 0.11787475240586262), (10, 0.10397620207248393), (11, 0.2154093406534881), (12, 0.09303236591175755), (13, 0.14045464728907694), (14, 0.08429582005066466), (15, 0.16718400676771392), (16, 0.1396926103665917), (17, 0.275264031111707), (18, 0.23749992601080708), (19, 0.275264031111707), (20, 0.11380838096450957), (21, 0.24533668588259758), (22, 0.1698084756807977), (23, 0.10077892357008249), (24, 0.17410245261873983), (25, 0.14084381167298515), (26, 0.19808434294520663), (27, 0.18757849449964836), (28, 0.12665539088532585), (29, 0.10428171230540707), (30, 0.1088417514909795), (31, 0.22645463333214763)]
[(3, 0.6890894435164647), (4, 0.31823619441633133), (14, 0.2613540892722885), (32, 0.5963015205567)]
[(15, 0.2042900925021272), (33, 0.3363582167069986), (3

# LSI model

In [175]:
# number of latent dimensions ("topics") to extract
num_topics = 10
# fit an LSI model on the raw bag-of-words corpus (not on the TF-IDF weighted one)
lsimodel = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)

In [104]:
# top terms of topic 0 together with their weights
lsimodel.show_topic(0)

[('γίγνομαι', 0.7277635755289887),
 ('ἔχω', 0.22411008311265845),
 ('σῶμα', 0.2101362852726716),
 ('πολύς', 0.16565138939592172),
 ('πᾶς', 0.1480166012949689),
 ('κοιλία', 0.11949590913050812),
 ('νόσος', 0.1056036564894304),
 ('κεφαλή', 0.10433806028914698),
 ('πυρετός', 0.098731644461411),
 ('ὀδύν*', 0.09764629337273882)]

In [143]:
# Table with one column per LSI topic, listing its ten top terms (weights dropped).
topics_dict = {
    f"topic {topic_id}": [term for term, weight in lsimodel.show_topic(topic_id, topn=10)]
    for topic_id in range(num_topics)
}
lsi_topics = pd.DataFrame(topics_dict)
lsi_topics

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,γίγνομαι,γίγνομαι,οἶνος,ἔχω,πολύς,πᾶς,χρῆ,κοιλία,ὀστέον,κοιλία
1,ἔχω,ἔχω,σῶμα,πᾶς,πᾶς,σῶμα,ὀστέον,σῶμα,ἡμέρα,κεφαλή
2,σῶμα,οἶνος,πίνω,σῶμα,σῶμα,κοιλία,πᾶς,πυρετός,κεφαλή,πυρετός
3,πολύς,σῶμα,ὕδωρ,πολύς,θερμός,ὀστέον,ποιέω,κεφαλή,ποιέω,ἡμέρα
4,πᾶς,ὕδωρ,ἔχω,ὀδύν*,ὅσος,χρῆ,ἔχω,ὀδύν*,χρῆ,ποιέω
5,κοιλία,χρῆ,δίδωμι,κοιλία,οἶνος,πολύς,ὕδωρ,ὕδωρ,ὕδωρ,ὕδωρ
6,νόσος,πίνω,πᾶς,ποιέω,χρόνος,αἷμα,ἡμέρα,ἡμέρα,κοιλία,ὀδύν*
7,κεφαλή,πᾶς,ὅσος,γίγνομαι,ποιέω,μέγας,ἕλκος,ἔχω,ἕλκος,σῶμα
8,πυρετός,πολύς,τρίβω,ἄνθρωπος,πυρετός,ὑγρός,πολύς,ὀστέον,σάρξ,ἔχω
9,ὀδύν*,ποιέω,ἡμέρα,νόσος,ἔχω,εἶμι,σῶμα,πολύς,νόσος,αἷμα


In [145]:
# topic 4 written out as a signed, weighted combination of its top terms
lsimodel.print_topic(4)

'0.872*"πολύς" + -0.324*"πᾶς" + -0.201*"σῶμα" + 0.101*"θερμός" + -0.089*"ὅσος" + -0.085*"οἶνος" + 0.073*"χρόνος" + -0.069*"ποιέω" + 0.065*"πυρετός" + -0.058*"ἔχω"'

In [146]:
# term-by-topic weight table: after the transpose, rows are terms and columns are topic numbers
# NOTE(review): labelling the columns with dictionary.values() assumes the values come back in token-id order - confirm
lsimodel_df = pd.DataFrame(lsimodel.get_topics(), columns=dictionary.values()).T

In [141]:
 # sort the terms by their weight in topic 4 (largest first) and show the top 20
lsimodel_df.sort_values(4, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
πολύς,0.165651,0.151776,0.019103,0.179733,0.871728,0.121368,0.125617,0.144609,-0.010803,0.038796
θερμός,0.072826,0.126653,-0.089586,0.013539,0.101332,-0.036864,0.079609,0.098633,-0.068343,0.030392
χρόνος,0.0971,0.042586,0.012096,0.048816,0.073308,0.049114,-0.02784,-0.041085,0.082053,-0.044126
πυρετός,0.098732,0.038761,0.045069,-0.065405,0.064801,-0.015696,0.099943,-0.302882,0.033145,-0.321265
νόσος,0.105604,0.018105,0.045883,0.077487,0.047291,-0.006255,-0.031741,-0.006536,0.132005,-0.04232
ὀλίγος,0.070111,0.074377,-0.079759,0.042655,0.04555,-0.002944,-0.042544,-0.04186,0.069752,-0.031475
λούω,0.018422,0.044423,-0.028645,-0.015445,0.042583,-0.005675,0.004637,0.012272,0.006214,-0.006389
ὀδύν*,0.097646,0.074928,0.029191,-0.139421,0.042545,-0.024515,0.089411,-0.185741,-0.049531,-0.204421
ἦδος,0.025168,0.010906,-0.000517,0.014777,0.031535,0.012882,-0.012989,-0.005639,-0.00076,-0.005194
κοιλία,0.119496,0.09996,0.124116,0.112295,0.024656,-0.261902,0.017895,-0.589007,-0.148471,0.59885


In [144]:
# position of a term within topic 4: label-based slicing is inclusive, so the number of
# rows up to and including the term's label in the sorted frame is its rank (1 = largest weight)
lsimodel_df_sorted = lsimodel_df.sort_values(4, ascending=False)  
len(lsimodel_df_sorted.loc[:"χρόνος"])

3

In [111]:
# the four starred term groups whose positions within the topics are looked up below
words = ['λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*']

In [127]:
# number of terms (rows) in the term-by-topic table
len(lsimodel_df)

25059

In [140]:
# Position (1 = largest weight) of each term of `words` in every LSI topic.
# Each topic is sorted once and then queried for all terms, instead of re-sorting
# the whole term-by-topic table for every (term, topic) pair.
# Weights are signed, so a position close to the vocabulary size means a strongly
# negative weight in that topic.
word_positions = {word: [] for word in words}
for n in range(num_topics):
    # terms ordered by their weight in topic n, largest first
    lsimodel_df_sorted = lsimodel_df.sort_values(n, ascending=False)
    for word in words:
        # label-based slicing is inclusive, so the slice length is the rank of `word`
        pos = len(lsimodel_df_sorted.loc[:word])
        word_positions[word].append(pos)
word_positions

{'λύπ*': [360, 628, 166, 226, 424, 24759, 19586, 1833, 217, 902],
 'ἄλγ*': [59, 66, 37, 25050, 24950, 25022, 18, 25051, 25028, 25051],
 'ὀδύν*': [10, 18, 20, 25058, 8, 25040, 8, 25056, 25046, 25056],
 'πόνο*': [20, 28, 6, 9, 12, 25054, 16, 25052, 11, 25053]}

In [126]:
# Position (1 = largest weight) of each term of `words` in every LSI topic.
word_positions = {}
for word in words:
    positions = []
    for n in range(num_topics):
        # terms ordered by their weight in topic n, largest first
        lsimodel_df_sorted = lsimodel_df.sort_values(n, ascending=False)
        # label-based slicing is inclusive, so the slice length is the rank of `word`
        pos = len(lsimodel_df_sorted.loc[:word])
        # Record the rank found in the *sorted* frame. Appending the row position in
        # the unsorted `lsimodel_df` would give the same number for every topic.
        positions.append(pos)
    word_positions[word] = positions
word_positions

{'λύπ*': [360, 628, 166, 226, 424, 24759, 19586, 1833, 217, 902],
 'ἄλγ*': [59, 66, 37, 25050, 24950, 25022, 18, 25051, 25028, 25051],
 'ὀδύν*': [10, 18, 20, 25058, 8, 25040, 8, 25056, 25046, 25056],
 'πόνο*': [20, 28, 6, 9, 12, 25054, 16, 25052, 11, 25053]}

In [133]:
# Tabulate the positions: one row per term, one column per topic.
columns = [f"pos in top {n}" for n in range(num_topics)]

# build the frame with the topic labels on the rows, then flip it so that terms become rows
word_positions_df = pd.DataFrame(word_positions, index=columns).T
word_positions_df

Unnamed: 0,pos in top 0,pos in top 1,pos in top 2,pos in top 3,pos in top 4,pos in top 5,pos in top 6,pos in top 7,pos in top 8,pos in top 9
λύπ*,360,628,166,226,424,24759,19586,1833,217,902
ἄλγ*,59,66,37,25050,24950,25022,18,25051,25028,25051
ὀδύν*,10,18,20,25058,8,25040,8,25056,25046,25056
πόνο*,20,28,6,9,12,25054,16,25052,11,25053


In [131]:
# row labels (terms) of the term-by-topic table, taken from the topic 0 column
lsimodel_df[0].index

Index(['κοιλία', 'ποιέω', 'ὕδωρ', 'ἔχω', 'γίγνομαι', 'ἄνθρωπος', 'ὅσος',
       'ὑγρός', 'καθίημι', 'ἄνοος',
       ...
       'ὀξύς', 'ἄλγ*', 'οἶνος', 'πόνο*', 'αἷμα', 'σῶμα', 'ὀδύν*', 'ἡμέρα',
       'πυρετός', 'κεφαλή'],
      dtype='object', length=25059)

# HDP model

In [69]:
# Fit a Hierarchical Dirichlet Process topic model on the bag-of-words corpus.
# HDP inference is stochastic: a fixed random_state makes the fitted topics
# reproducible across kernel restarts (42 is the seed also used for TruncatedSVD
# further below).
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)

In [70]:
# term-by-topic table of the HDP model: after the transpose, rows are terms and columns are topic numbers
# NOTE(review): labelling the columns with dictionary.values() assumes the values come back in token-id order - confirm
hdpmodel_df = pd.DataFrame(hdpmodel.get_topics(), columns=dictionary.values()).T

In [72]:
# display the term-by-topic table of the HDP model (the DataFrame, not the model object)
hdpmodel_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
αἰτία,0.000043,0.000052,0.000006,0.000141,0.000038,0.000148,0.000093,0.000172,0.000126,0.000093,...,0.000051,0.000177,0.000096,0.000047,0.000002,0.000058,4.999596e-05,0.000058,0.000056,0.000212
βραχὺ,0.000024,0.000062,0.000030,0.000056,0.000100,0.000099,0.000007,0.000017,0.000037,0.000039,...,0.000061,0.000035,0.000050,0.000115,0.000037,0.000041,1.651723e-05,0.000122,0.000012,0.000029
γράφω,0.000125,0.000280,0.000039,0.000060,0.000369,0.000073,0.000084,0.000113,0.000085,0.000125,...,0.000075,0.000083,0.000065,0.000020,0.000089,0.000120,6.933962e-05,0.000070,0.000001,0.000058
δημιουργός,0.000003,0.000030,0.000037,0.000016,0.000102,0.000083,0.000014,0.000061,0.000098,0.000089,...,0.000012,0.000012,0.000068,0.000071,0.000030,0.000005,1.055637e-05,0.000073,0.000010,0.000107
εἶμι,0.000519,0.000608,0.000737,0.000799,0.000635,0.000597,0.000464,0.000806,0.000671,0.000605,...,0.000262,0.000231,0.000252,0.000266,0.000199,0.000251,2.404315e-04,0.000203,0.000192,0.000254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
χρηστοὺς,0.000045,0.000090,0.000019,0.000021,0.000016,0.000023,0.000001,0.000057,0.000020,0.000018,...,0.000008,0.000061,0.000009,0.000033,0.000029,0.000039,2.835586e-06,0.000041,0.000036,0.000014
ἡρμοσμένῳ,0.000001,0.000002,0.000010,0.000002,0.000095,0.000020,0.000031,0.000081,0.000027,0.000152,...,0.000019,0.000092,0.000059,0.000079,0.000073,0.000057,8.874343e-07,0.000047,0.000037,0.000016
σύμβουλος,0.000030,0.000027,0.000029,0.000018,0.000023,0.000025,0.000003,0.000094,0.000084,0.000023,...,0.000006,0.000052,0.000016,0.000003,0.000022,0.000013,2.127276e-06,0.000043,0.000027,0.000030
εὐμοιρήσομεν,0.000002,0.000067,0.000007,0.000022,0.000003,0.000024,0.000016,0.000013,0.000031,0.000020,...,0.000014,0.000045,0.000028,0.000050,0.000185,0.000007,1.342345e-04,0.000004,0.000051,0.000085


# LSA with  sklearn

In [177]:
# We can re-use our gensim dictionary here.

# As a corpus, however, we cannot use the default gensim bag-of-words representation;

# we just need every word replaced by its integer id.

# bag-of-words vectors, one per sentence (built the same way as `corpus` earlier in the notebook)
corpus_bow = [dictionary.doc2bow(sent) for sent in sentences_list]
# index-encoded sentences: every word replaced by its id in the dictionary
corpus_idx = [dictionary.doc2idx(sent) for sent in sentences_list]

In [180]:
# number of sentences in the index-encoded corpus
len(corpus_idx)

24459

In [None]:
### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
def lsa_model(list_of_lists, vocabulary, n_components=25):
    """Fit an LSA model on a TF-IDF weighted term-term matrix.

    These statements previously stood at the top level and referred to the
    undefined names `list_of_lists` and `vocabulary`, while the notebook calls
    `lsa_model(...)` and unpacks two return values. They are therefore wrapped
    in the function those calls expect.

    Parameters
    ----------
    list_of_lists : list of list of str
        Tokenised sentences; each sentence is joined with spaces before it is
        handed to the vectorizer.
    vocabulary : mapping or iterable of str
        Passed unchanged to ``TfidfVectorizer(vocabulary=...)``.
    n_components : int, default 25
        Number of SVD components to keep.

    Returns
    -------
    lsa_model_data : pandas.DataFrame
        SVD components, one row per component and one column per term.
    cooc : pandas.DataFrame
        Dense term-term matrix ``X.T @ X`` labelled with the terms on both axes.
    """
    # NOTE(review): the vectorizer re-tokenises the joined sentences with its own
    # defaults, which may lowercase or split tokens such as 'ὀδύν*' - confirm that
    # the tokens still match `vocabulary`.
    vec = TfidfVectorizer(vocabulary=vocabulary) ### initialize the model
    X = vec.fit_transform([" ".join(sentence) for sentence in list_of_lists]) ### run the model
    # term-term matrix; `@` is always the matrix product (for scipy sparse matrices `*` means the same)
    Xc = X.T @ X
    svd = TruncatedSVD(n_components=n_components, n_iter=5, random_state=42)
    svd.fit(Xc)
    # newer scikit-learn releases provide get_feature_names_out(); older ones only get_feature_names()
    if hasattr(vec, "get_feature_names_out"):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()
    # NOTE: toarray() makes the term-term matrix dense (vocabulary size squared values)
    cooc = pd.DataFrame(Xc.toarray(), columns=feature_names, index=feature_names)
    lsa_model_data = pd.DataFrame(svd.components_, columns=feature_names)
    return lsa_model_data, cooc

def get_most_similar(model_df, target_term, number):
    """Rank the terms of `model_df` by cosine similarity to `target_term`.

    `model_df` holds one column per term. Every column is compared with the
    column of `target_term`, and the (term, similarity) pairs are sorted by
    similarity in descending order. The first pair of the ranking is skipped
    and the pairs at positions 1 .. number-1 are returned, i.e. at most
    `number - 1` pairs.
    """
    target_vector = model_df[target_term]
    scored_terms = [
        (term, cosine_similarity([target_vector, model_df[term]])[0][1])
        for term in model_df.columns
    ]
    ranking = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    return ranking[1:number]

# NOTE(review): `sentences_john` and `words_john` are not defined anywhere in the visible
# notebook, so this call presumably is a leftover from another notebook - confirm or remove
model_john_lsa, cooc_john = lsa_model(sentences_john, words_john)



In [None]:
# Fit the LSA model on the lemmatized sentences, using the gensim dictionary's
# token -> id mapping as the vectorizer vocabulary.
# NOTE(review): the call was left unfinished (`lsa_model( vocabulary=)`, a SyntaxError);
# the arguments below are the objects prepared earlier in the notebook - confirm.
# NOTE(review): `lsa_model` has to be defined before this cell runs.
lsamodel, cooc = lsa_model(sentences_list, vocabulary=dictionary.token2id)

