In [1]:
### PREREQUISITES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [2]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it imports a 600+ MB file containing an ancient Greek dictionary

In [4]:
### not necessary for reading the data, only for exporting results to sciencedata.dk
### sddk.configure() asks interactively for the sciencedata.dk username and password;
### the returned object is indexed below as conf[0] and conf[1]
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [5]:
# To access the Google Sheets below you need a Google service-account key (a JSON file).
# Here the key is stored in the author's personal space on sciencedata.dk and is read from there:

# (1) fetch the key file and parse its JSON content
#     (conf[0] is used as the request object and conf[1] as the URL prefix)
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes that are needed (spreadsheet feeds and Drive)
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the scoped credentials to authenticate the gspread client
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheets identified by their URLs
PIA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KxOx7Be9fj3lDcEPgQhQ-Iqcn9p367-MMD6RMXe8rks/edit?usp=sharing")
PIA_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1e94wyelg6dftQ4zxbq1xvwxWAI-BhcYXtclDW-YTnrw/edit?usp=sharing")

In [6]:
# identifier of the public sciencedata.dk folder that holds the shared data
publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
# read the enriched corpus from that folder
# NOTE(review): the "df" argument presumably asks sddk to return a pandas DataFrame — confirm against the sddk docs
c_hippocraticum = sddk.read_file("c_hippocraticum_enriched.json", "df", publicfolder)

reading file located in a public folder


In [7]:
# Lemmata that are dropped from every sentence.
words_to_remove = ["ἢν", "ὁκόταν"]

# Lemmata that are substituted (lemma as found -> corrected lemma).
words_to_replace = {
    "ἔχις" : "ἔχω"}


def _clean_sentence(sentence):
    """Return `sentence` without the `words_to_remove` and with `words_to_replace` applied.

    Removal is decided on the word as found; the surviving words are then
    looked up in `words_to_replace` and left unchanged when they have no entry.
    """
    return [words_to_replace.get(word, word) for word in sentence if word not in words_to_remove]


# Each cell of the column holds a list of sentences, each sentence a list of lemmata.
# The replacement value is looked up in `words_to_replace` itself, so the cell does not
# depend on any name defined outside of it.
c_hippocraticum["lemmatized_sentences_repl"] = c_hippocraticum["lemmatized_sentences_repl"].apply(
    lambda sentences_list: [_clean_sentence(sentence) for sentence in sentences_list])

# Import and preprocess corpus

In [8]:
# list the columns available in the corpus dataframe
c_hippocraticum.columns

Index(['filename', 'author', 'title', 'string', 'wordcount', 'author_id',
       'doc_id', 'raw_date', 'date_avr', 'date_probs', 'date_manual',
       'provenience', 'sentences', 'lemmata', 'lemmata_wordcount',
       'lemmatized_sentences', 'n_sentences', 'lemmata_repl',
       'lemmatized_sentences_repl', 'λύπ*', 'ἄλγ*', 'ὀδύν*', 'πόνο*',
       'terms_sum', 'λύπ*_TF', 'ἄλγ*_TF', 'ὀδύν*_TF', 'πόνο*_TF', 'TF_sum',
       'work_cat_jouanna', 'work_cat_craik', 'work_cat_linka'],
      dtype='object')

In [9]:
def get_flat_sentences(series):
    """Concatenate the per-row sentence lists of `series` into one flat list.

    `series` is expected to offer `.tolist()` and to hold one iterable of
    sentences per element; the sentences themselves are passed through untouched.
    """
    flattened = []
    for document in series.tolist():
        flattened.extend(document)
    return flattened

In [10]:
# in this notebook a "document" is a single sentence:
# flatten the per-row sentence lists into one list of lemmatized sentences
docs = get_flat_sentences(c_hippocraticum["lemmatized_sentences_repl"])

In [11]:
# peek at the first ten sentences
print(docs[:10])

[['ὁπόσος', 'ἐπιχειρέω', 'ἰητρικῆς', 'λέγω', 'γράφω', 'ὑπόθεσις', 'λόγος', 'θερμός', 'ψυχρός', 'ὑγρός', 'ξηρός', 'ἐθέλω', 'βραχὺ', 'ἀρχην', 'αἰτία', 'ἄνθρωπος', 'νόσος', 'θάνατος', 'πᾶς', 'ἵημι', 'πολύς', 'λέγω', 'καταφανής', 'εἶμι', 'ἄξιος', 'μέμφομαι', 'τέχνη', 'χρέονταί', 'πᾶς', 'μέγας', 'τιμάω', 'ἀγαθοὺς', 'χειροτέχνης', 'δημιουργός'], ['εἶμι', 'δημιουργός', 'φαῦλος', 'πολύς'], ['ἰητρικη', 'ὅλοξ', 'μηδʼ', 'ἔσκεπτο', 'μηδʼ', 'εὑρίσκω', 'μηδείς', 'πᾶς', 'ὅμοιος', 'ἄπειροί', 'ἀνεπιστήμων', 'τύχη', 'πᾶς', 'κάμνω', 'διοικεῖτο'], ['ἔχω', 'τέχνη', 'πᾶς', 'δημιουργός', 'πολύς', 'ἀλλήλων', 'διαφέρω', 'χείρ', 'γνώμη', 'ἰητρικῆς'], ['ἀξιόω', 'αὐτην', 'κενός', 'ὑπόθεσις', 'δέω', 'ἀφανέα', 'ἀπορεόμενα', 'ἀνάγκη', 'ἐπιχειρέω', 'λέγω', 'ὑπόθεσις', 'χράομαι', 'οἷος', 'μετέωρος', 'γῆ'], ['εἶμι', 'λέγω', 'γιγνώσκω', 'ἔχω', 'οὔτʼ', 'λέγω', 'ἀκούω', 'δῆλος', 'ἐάω', 'ἀληθής'], ['χρῆ', 'ἀνενέγκαντα', 'οἶδα', 'σαφής'], ['ἰατρικός', 'πάλη', 'πᾶς', 'ὑπάρχω', 'ἀρχη', 'ὁδός', 'ἣν', 'πολύς', 'καλός', 'εὑρίσκω

In [12]:
# total number of lemmata: per-row lengths of the "lemmata_repl" lists, summed
c_hippocraticum["lemmata_repl"].apply(len).sum()

171233

In [13]:
# Subcorpora for possible later exploration: one flat list of lemmatized
# sentences per value of the "work_cat_linka" column.
subcorpora = {}
for cat in c_hippocraticum["work_cat_linka"].unique():
    in_category = c_hippocraticum["work_cat_linka"] == cat
    subcorpora[cat] = get_flat_sentences(c_hippocraticum.loc[in_category, "lemmatized_sentences_repl"])

# report the number of sentences in each subcorpus
for key, subcorpus_sentences in subcorpora.items():
    print(key, len(subcorpus_sentences))

Theoretical 2504
Other 16911
Practical 5044


In [14]:
# create a gensim dictionary (mapping between tokens and integer ids) from our list of sentences
dictionary = corpora.Dictionary(docs)

There are a number of methods that can be applied to a dictionary object;
see: https://radimrehurek.com/gensim/corpora/dictionary.html

First of all, you can inspect it like a standard Python dictionary.


In [15]:
# uncomment below to print the whole dictionary
# dictionary

In [16]:
# the dictionary is keyed by integer ids; show the first ten (id, token) pairs
dict(list(dictionary.items())[:10])

{0: 'αἰτία',
 1: 'βραχὺ',
 2: 'γράφω',
 3: 'δημιουργός',
 4: 'εἶμι',
 5: 'θάνατος',
 6: 'θερμός',
 7: 'καταφανής',
 8: 'λέγω',
 9: 'λόγος'}

In [17]:
# the reverse mapping (token -> id) is available through the token2id attribute
dict(list(dictionary.token2id.items())[:10])

{'αἰτία': 0,
 'βραχὺ': 1,
 'γράφω': 2,
 'δημιουργός': 3,
 'εἶμι': 4,
 'θάνατος': 5,
 'θερμός': 6,
 'καταφανής': 7,
 'λέγω': 8,
 'λόγος': 9}

In [20]:
# collection frequencies: token id -> total number of occurrences across all documents
dictionary.cfs[5] # how many instances of word with id 5 (= "θάνατος")

80

In [21]:
# document frequencies: token id -> number of documents containing it (a document is one sentence here)
dictionary.dfs[5] # how many documents contain the word with id 5 (= "θάνατος")

80

In [22]:
# total number of tokens processed while building the dictionary (gensim: "corpus positions")
dictionary.num_pos

167358

In [23]:
# number of distinct tokens, i.e. dictionary entries
len(dictionary.keys())

25059

In [25]:
# pick one sentence (a list of lemmata) as a worked example
sent = docs[20]
sent

['αὐτη',
 'ἀνάγκη',
 'ἰητρικην',
 'ποιέω',
 'ζητηθῆναί',
 'εὑρίσκω',
 'ἄνθρωπος',
 'κάμνω',
 'ταὐτάζω',
 'προσφερομένοισι',
 'συμφέρω',
 'συμφέρω']

In [27]:
# doc2bow = document to bag-of-words: a list of (token_id, count) tuples,
# one per distinct token of the sentence
dictionary.doc2bow(sent)

[(23, 1),
 (34, 1),
 (35, 1),
 (57, 1),
 (127, 1),
 (128, 2),
 (131, 1),
 (132, 1),
 (133, 1),
 (134, 1),
 (135, 1)]

# LSA with  sklearn

In [177]:
# We can reuse the gensim dictionary built above to encode the sentences.
# Two encodings of the flat sentence list `docs` are prepared:
#   corpus_bow - gensim bag-of-words: per sentence, a list of (token_id, count) tuples
#   corpus_idx - per sentence, the sequence of token ids (the words simply replaced by their ids)
corpus_bow = [dictionary.doc2bow(sent) for sent in docs]
corpus_idx = [dictionary.doc2idx(sent) for sent in docs]

In [180]:
# number of encoded sentences
len(corpus_idx)

24459

In [None]:
def lsa_model(list_of_lists, vocabulary, n_components=25, n_iter=5, random_state=42):
    """Build a term-term LSA model from tokenized sentences.

    Parameters
    ----------
    list_of_lists : list of list of str
        Tokenized sentences; each one is joined with spaces before vectorization.
    vocabulary : mapping or iterable of str
        Terms to keep, passed to `TfidfVectorizer(vocabulary=...)`.
    n_components, n_iter, random_state
        Passed to `TruncatedSVD`; the defaults are the values used so far.

    Returns
    -------
    (lsa_model_data, cooc)
        lsa_model_data : DataFrame with one row per SVD component and one column per term.
        cooc : DataFrame holding the dense term-by-term matrix X.T @ X.

    Notes
    -----
    `cooc` is a dense array, so memory use grows with the square of the vocabulary size.
    NOTE(review): sentences are re-tokenized by TfidfVectorizer with its default
    settings, so some tokens may not survive unchanged — confirm this is intended.
    """
    vec = TfidfVectorizer(vocabulary=vocabulary)  # initialize the model
    X = vec.fit_transform([" ".join(sentence) for sentence in list_of_lists])  # run the model
    # `@` is a matrix product for sparse matrices as well as sparse arrays
    Xc = X.T @ X
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    svd.fit(Xc)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it on old versions only
    if hasattr(vec, "get_feature_names_out"):
        terms = vec.get_feature_names_out()
    else:
        terms = vec.get_feature_names()
    cooc = pd.DataFrame(Xc.toarray(), columns=terms, index=terms)
    lsa_model_data = pd.DataFrame(svd.components_, columns=terms)
    return lsa_model_data, cooc


def get_most_similar(model_df, target_term, number):
  """Rank the terms of a model by cosine similarity to `target_term`.

  Parameters
  ----------
  model_df : pandas.DataFrame
      One column per term; each column is used as that term's vector.
  target_term : str
      Column label of the term to compare against (KeyError if absent).
  number : int
      Cut-off: at most `number - 1` neighbours are returned, the same
      count as the former positional slice `[1:number]`.

  Returns
  -------
  list of (term, similarity) tuples, most similar first, without `target_term`.
  """
  # one vectorized call: rows are term vectors, the single query row is the target
  term_vectors = model_df.to_numpy().T
  target_vector = model_df[[target_term]].to_numpy().T
  similarities = cosine_similarity(target_vector, term_vectors)[0]
  # exclude the target by name: dropping "the first result" is wrong whenever
  # another term ties with (or rounds above) the target's self-similarity
  neighbours = [(term, score) for term, score in zip(model_df.columns, similarities) if term != target_term]
  neighbours.sort(key=lambda pair: pair[1], reverse=True)
  return neighbours[:max(number - 1, 0)]

# NOTE(review): `sentences_john` and `words_john` are not defined in the visible part of
# this notebook — presumably carried over from another analysis; confirm or remove.
model_john_lsa, cooc_john = lsa_model(sentences_john, words_john)



In [None]:
# Fit the LSA model on the flat sentence list, using the tokens of the gensim
# dictionary as the vocabulary.
# NOTE(review): the choice of inputs is an assumption — confirm. With the full dictionary
# the dense co-occurrence matrix is vocabulary-size squared and may not fit in memory.
lsamodel, cooc = lsa_model(docs, vocabulary=list(dictionary.token2id))

