In [9]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import gensim
import plotly.express as px
from sklearn.manifold import TSNE
# Project onto 2 components. t-SNE is stochastic, so pin the seed: without it
# every kernel restart yields a different layout and a different exported CSV.
tsne = TSNE(n_components=2, random_state=42)

# Data

In [1]:
def get_fnames(cts_list):
  """Collect the paths of the 'grc1'/'grc2' files stored below ``./data/<cts>``.

  Args:
    cts_list: iterable of directory names located directly under ``./data``.

  Returns:
    A list of ``"<directory>/<file name>"`` strings, one per file whose name
    contains ``'grc2'`` or ``'grc1'`` and does not contain ``'json'``. Files
    sitting directly in ``./data/<cts>`` are not considered, only files in
    directories below it.
  """
  walked = []
  for cts in cts_list:
    # The first os.walk entry is ./data/<cts> itself; keep only what lies below.
    walked.extend(list(os.walk(f'./data/{cts}'))[1:])

  return [
    f"{root}/{name}"
    for root, _subdirs, names in walked
    for name in names
    if 'json' not in name and ('grc2' in name or 'grc1' in name)
  ]

In [6]:
# Names of the directories located directly under ./data.
# Only the first os.walk entry is needed, so stop there instead of
# materialising a walk over the whole corpus tree.
ctses = next(os.walk('./data'))[1]
f_names = get_fnames(ctses)

In [84]:
def get_title_author_text(f_name):
  """Parse one XML file into a ``(title, author, text)`` tuple.

  Files with more than three ``<p>`` elements are read paragraph by paragraph:
  the first ``<p>`` is skipped and, for each remaining one, everything up to
  and including its first ``'.'`` is discarded, the remaining ``'.'``-separated
  pieces are joined with spaces, and two fixed newline-plus-indent patterns
  are removed. All other files are read from their ``<l>`` elements.

  Args:
    f_name: path of the XML file to parse.

  Returns:
    ``(title, author, text)`` where ``text`` is a list of strings, or ``None``
    when the title contains ``'Greek Anthology'``. In the paragraph branch a
    title containing ``'New Testament'`` gets the author ``'New Testament'``
    instead of the ``<author>`` element's text.
  """
  # Close the handle deterministically and decode explicitly, so reading Greek
  # text does not depend on the platform's default encoding.
  with open(f_name, encoding='utf-8') as xml_file:
    soup = BeautifulSoup(xml_file, features='xml')

  paragraphs = soup.find_all('p')

  if len(paragraphs) > 3:
    text = [
      ' '.join(p.text.split('.')[1:]).strip().replace('\n             ','').replace('\n     ','')
      for p in paragraphs[1:]
    ]
    if 'New Testament' in soup.title.text:
      return soup.title.text, 'New Testament', text
  else:
    text = [line.text for line in soup.find_all('l')]

  if 'Greek Anthology' in soup.title.text:
    # Deliberately skipped; the explicit None makes the contract visible.
    return None
  return soup.title.text, soup.author.text, text

In [102]:
# Map every file path to its parsed (title, author, text) tuple, or to None
# when the parser skips the file.
greek_dict = {path: get_title_author_text(path) for path in f_names}

In [103]:
# One row per file: the path plus the parsed tuple spread over three columns.
g = (
  pd.DataFrame.from_dict(greek_dict, orient='index')
  .reset_index()
  .rename(columns={'index':'filename',0:'title',1:'author',2:'text_list'})
  # Files for which the parser returned None have no title/author/text.
  .dropna()
  # Renumber 0..n-1 after dropping rows. The Doc2Vec tags below are row
  # positions, and later cells attach g.title / g.author by index label, so
  # any gap left by dropna() would pair vectors with the wrong metadata.
  .reset_index(drop=True)
)

In [104]:
# Display the assembled corpus table (filename, title, author, text_list).
g

Unnamed: 0,filename,title,author,text_list
0,./data/tlg0057/tlg010/tlg0057.tlg010.perseus-g...,On the Natural Faculties.,Galen,[εἰ δέ τις καὶ τοῖς \nφυτοῖς ψυχῆς μεταδίδωσι ...
1,./data/tlg0059/tlg009/tlg0059.tlg009.perseus-g...,Parmenides,Plato,"[ἀλλὰ μὲν δή, εἶπον ἐγώ, πάρειμί γε ἐπʼ αὐτὸ τ..."
2,./data/tlg0059/tlg036/tlg0059.tlg036.perseus-g...,Epistles,Plato,"[, διατρίψας ἐγὼ παρʼ ὑμῖν χρόνον τοσοῦτον καὶ..."
3,./data/tlg0059/tlg031/tlg0059.tlg031.perseus-g...,Timaeus,Plato,"[εἷς, δύο, τρεῖς· ὁ δὲ δὴ τέταρτος ἡμῖν, ὦ φίλ..."
4,./data/tlg0059/tlg007/tlg0059.tlg007.perseus-g...,Sophist,Plato,"[κατὰ τὴν χθὲς ὁμολογίαν, ὦ Σώκρατες, ἥκομεν α..."
...,...,...,...,...
763,./data/tlg0014/tlg019/tlg0014.tlg019.perseus-g...,περὶ τῆς παραπρεσβείας,Demosthenes,"[δεήσομαι δὲ πάντων ὑμῶν, ἃ καὶ τοῖς μὴ δεηθεῖ..."
764,./data/tlg0014/tlg021/tlg0014.tlg021.perseus-g...,κατὰ Μειδίου περὶ τοῦ Κονδύλου,Demosthenes,"[ἐγὼ δʼ, ὅπερ ἂν καὶ ὑμῶν ἕκαστος ὑβρισθεὶς πρ..."
765,./data/tlg0014/tlg017/tlg0014.tlg017.perseus-g...,περὶ τῶν πρὸς Ἀλέξανδρον συνθηκῶν,Demosthenes,[δεῖ\n\t\t\t\t\tτοίνυν τοὺς λίαν ἐπʼ αὐτὰ παρα...
766,./data/tlg0014/tlg028/tlg0014.tlg028.perseus-g...,κατὰ Ἀφόβου β΄,Demosthenes,"[εἶπεν γὰρ ὡς ὁ πάππος ὤφειλε τῷ δημοσίῳ, καὶ ..."


# Doc2Vec

In [105]:
#from: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
def read_corpus(text_lists,tokens_only=False):
  """Yield one tokenised document per entry of ``text_lists``.

  Args:
    text_lists: iterable of lists of strings; each list is joined with single
      spaces into one document before tokenising.
    tokens_only: when true, yield the bare token list; otherwise yield a
      ``TaggedDocument`` whose only tag is the document's position in
      ``text_lists``.

  Yields:
    A token list or a ``gensim.models.doc2vec.TaggedDocument``.
  """
  for doc_id, pieces in enumerate(text_lists):
    tokens = gensim.utils.simple_preprocess(' '.join(pieces))
    yield tokens if tokens_only else gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])

In [111]:
# Materialise the tagged documents; tag i belongs to the i-th row of g.
corpus = list(read_corpus(g['text_list'].tolist()))

In [112]:
# Untrained Doc2Vec model; the vocabulary and weights are filled in by the
# build_vocab / train calls that follow.
model = gensim.models.doc2vec.Doc2Vec(
  vector_size=100,
  min_count=2,
  window=10,
  epochs=40,
)

In [113]:
# Build the model's vocabulary from the tagged corpus before training.
model.build_vocab(corpus)

In [114]:
# Spot-check the vocabulary: look up the stored 'count' attribute of one token.
model.wv.get_vecattr('ἀγαθοί','count')

66

In [116]:
# Train on the corpus, reusing the document count and epoch setting that the
# model already stores.
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [117]:
# One document vector per key, stacked in index_to_key order, then projected
# with the t-SNE instance created in the import cell.
vec_list = np.asarray(list(map(model.dv.get_vector, model.dv.index_to_key)))
_tsne = tsne.fit_transform(vec_list)

In [128]:
# Document tags are row POSITIONS in g (read_corpus tags by enumerate), so the
# metadata is looked up with iloc. Assigning g.title / g.author directly would
# align on g's index LABELS and attach the wrong title/author (or NaN)
# whenever g's index is not exactly 0..n-1.
doc_index = list(model.dv.index_to_key)
doc_meta = g.iloc[doc_index]

df = pd.DataFrame({
  'doc_index': doc_index,
  'x': _tsne[:, 0],
  'y': _tsne[:, 1],
  'title': doc_meta['title'].to_numpy(),
  'author': doc_meta['author'].to_numpy(),
})
df = df.dropna()

fig = px.scatter(
  df, x='x', y='y', color='author', hover_data=['title'],
  title='t-SNE projection of Doc2Vec document vectors',
)
fig.show()

In [119]:
# Persist the document vectors (model.dv) only, not the whole model.
model.dv.save('grc_d2v.kv')

In [123]:
# Export the 2-D coordinates together with title and author.
df.to_csv('grc_df.csv')

In [155]:
# Rows whose author matches either alternative of the regular expression.
df[df['author'].str.contains('New Testament|Aristophanes')]

Unnamed: 0,doc_index,x,y,title,author
220,220,12.597557,29.217722,Frogs,Aristophanes
221,221,11.614311,29.586895,Lysistrata,Aristophanes
222,222,11.342513,29.459852,Acharnians,Aristophanes
223,223,10.427096,29.64887,Birds,Aristophanes
224,224,11.935334,29.625521,Thesmophoriazusae,Aristophanes
225,225,15.841143,27.380346,Clouds,Aristophanes
226,226,15.840292,27.380499,Clouds,Aristophanes
227,227,11.526772,28.642685,Wasps,Aristophanes
228,228,10.987744,28.806728,Peace,Aristophanes
229,229,11.718765,28.441807,Knights,Aristophanes


In [146]:
# Document tag -> title, pairing tags and titles by position.
index2title = dict(zip(model.dv.index_to_key, g['title'].tolist()))
# Title -> document tag, built from the same pairs in the same order.
# NOTE: titles are not unique, so for a repeated title the last tag wins.
title2index = {title: tag for tag, title in index2title.items()}

In [172]:
import pickle

# Persist both lookup tables. The context managers guarantee each file is
# flushed and closed, instead of leaving open handles behind in the kernel.
with open('title2index.p','wb') as fh:
  pickle.dump(title2index, fh)
with open('index2title.p','wb') as fh:
  pickle.dump(index2title, fh)

In [170]:
import re
def get_urn_cts(fname):
    """Return the first ``tlg<digits>.tlg<digits>`` identifier found in ``fname``.

    Args:
        fname: string to search, e.g. a corpus file path.

    Returns:
        The matched substring, such as ``'tlg0057.tlg010'``.

    Raises:
        TypeError: if ``fname`` contains no such identifier, because the
            failed search yields ``None``, which cannot be indexed.
    """
    urn_match = re.search(r"(tlg\d+\.tlg\d+)", fname)
    return urn_match[0]

In [177]:
# The 30 documents closest to Romans, shown as (title, Perseus URL, similarity).
res = model.dv.most_similar(title2index['New Testament - Romans'], topn=30)
[
  (
    g.iloc[doc_tag]['title'],
    f'http://data.perseus.org/texts/urn:cts:greekLit:{get_urn_cts(g.iloc[doc_tag]["filename"])}',
    similarity,
  )
  for doc_tag, similarity in res
]

[('New Testament - 2 Corinthians',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg008',
  0.8035832643508911),
 ('New Testament - 1 Corinthians',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg007',
  0.7986435890197754),
 ('New Testament - 1 John',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg023',
  0.7570983171463013),
 ('New Testament - James',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg020',
  0.7469120025634766),
 ('Βαρνάβα ἐπιστολή',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg1216.tlg001',
  0.7348121404647827),
 ('New Testament - 1 Peter',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg021',
  0.7182907462120056),
 ('New Testament - Acts',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg005',
  0.7172637581825256),
 ('New Testament - John',
  'http://data.perseus.org/texts/urn:cts:greekLit:tlg0031.tlg004',
  0.694157063961029),
 ('New Testament - Luke',
  'http://data.perseus.org/

In [163]:
# Raw string: '\d' and '\.' in a plain literal are invalid escape sequences
# and trigger a warning on current Python versions.
re.search(r'(tlg\d+\.tlg\d+)', g.iloc[0]['filename'])[0]

'tlg0057.tlg010'

In [175]:
# Export the metadata without the text column.
g_notext = g.drop(columns=['text_list'])
g_notext.to_csv('grc_perseus_notext.csv')