In [None]:
#TF-IDF processing
# via the Steven Loria tutorials: http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

import math
from textblob import TextBlob as tb

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The frequency is the count reported by ``blob.words.count(word)``
    divided by the total number of words in the document.

    Parameters
    ----------
    word : the token to look up.
    blob : an object exposing a ``words`` sequence (a TextBlob in this
        notebook).

    Returns
    -------
    float
        The relative frequency of ``word``, or ``0.0`` when the document
        contains no words at all.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    return words.count(word) / total

def n_containing(word, bloblist):
    """Count the documents in ``bloblist`` whose ``words`` contain ``word``."""
    hits = 0
    for document in bloblist:
        if word in document.words:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` across ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` the number of documents containing ``word``. Because of the
    ``1 +`` term the result is negative when every document contains the
    word and zero when all but one do.
    """
    total_documents = len(bloblist)
    documents_with_word = n_containing(word, bloblist)
    ratio = total_documents / (1 + documents_with_word)
    return math.log(ratio)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the term frequency of the word in ``blob`` multiplied by
    its inverse document frequency over all documents in ``bloblist``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

# The corpus is empty at this point, so the report loop below performs no
# iterations and prints nothing.
bloblist = []
for position, document in enumerate(bloblist, start=1):
    print("Top words in document {}".format(position))
    # Score every word of the document against the whole corpus.
    scores = {}
    for word in document.words:
        scores[word] = tfidf(word, document, bloblist)
    # Highest score first; only the top three are reported.
    sorted_words = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

In [None]:
FinalHelen= tb("""νεῖλος μὲν ὅδε καλλιπάρθενος ῥοή ὃς ἀντὶ δῖος ψακάς αἴγυπτος πέδον λευκός τήκω χιών ὑγραίνω ἐγγυάω Πρωτεὺς δέ ὅστις ζάω ὅδε γαῖα τύραννος εἰμί φάρος μὲν οἰκέω νέω αἴγυπτος δέ ἄναξ ὃς ὁ κατά οἶδμα παρθένος εἷς γαμέω ψαμάθη ἐπειδὴ λέκτρον ἀφίημι αἰακός τίκτω δὲ τέκνον δισσὰ ὅδε ἐν δῶμα θεοκλύμενος ἄρσην ὅστις δὴ θεοὺς σέβω βιόω διαφέρω εὐγενής τε παρθένος Εἰδώ τὸ μητρὸς ἀγλάισμα ὅστις εἰμί βρέφος ἐπεὶ δέ εἰς ἅπτω ἔρχομαι ὡραῖος γάμος καλέω αὐτὴν Θεονόην τὰ θεῖος γὰρ ὁ σύ εἰμί καὶ μέλλω πᾶς ἐπίσταμαι πρόγονος λαμβάνω νηρεύς τιμὰς παρά ἐγώ δὲ γαῖα μὲν πατρὶς οὐ ἀνώνυμος σπάρτη πατὴρ δὲ Τυνδάρεως εἰμί δὲ δὴ λόγος τις ὡς Ζεὺς μήτηρ πέτομαι εἰς ἐμὴν λήδη κύκνος μόρφωμα ὄρνις λαμβάνω ὃς δόλιος εὐνὴν ἐκπράσσω ὑπό ἀετός δίωγμα φεύγω εἰ σαφὴς οὗτος λόγος ἑλένη δέ καλέω ἃ δὲ πάσχω κακὰ λέγω ἀνά ἔρχομαι τρέω θεαὶ κάλλος περί ἰδαῖος εἰς κευθμών ἀλέξανδρος παρά ἥρα κύπρις τε διογενής τε παρθένος μορφάω ἐθέλω διαπεραίνω κρίσις τοὐμὸν δὲ κάλλος εἰ καλὸν τὸ δυστυχής κύπρις προτείνασ ὡς ἀλέξανδρος γαμέω νικάω λιπὼν δὲ βούσταθμον ἰδαῖος πάρις σπάρτη ἀφικνέομαι ὡς ἐμὸν ἔχω λέχος ἥρα δὲ μεμφθεῖσ οὕνεκα οὐ νικάω θεὰς ἐξανεμόω ἐγώ ἀλέξανδρος λέχος δίδωμι δέ οὐ ἐγώ ἀλλά ὁμοιώσασ ἐμοὶ εἴδωλον ἐμπνέω οὐρανός ξυνθεῖσ ἀπό πρίαμος τύραννος παιδί καὶ δοκέω ἐγώ ἔχω κενὴν δόκησις οὐ ἔχω τὰ δέ αὖ Διὸς βούλευμα ἄλλος ὅδε συμβαίνω κακοῖς πόλεμος γὰρ εἰσφέρω ἕλλην χθονὶ καὶ Φρυξὶ δύστηνος ὡς ὄχλος βροτός πλῆθος τε κουφίζω μήτηρ χθών γνωτός τε τίθημι τὸν κράτιστος ἑλλάς φρύξ δέ εἰς ἀλκὴν προτίθημι ἐγὼ μὲν οὐ τὸ δέ ὄνομα ἐμός ἆθλος ἕλλην δόρυ λαβὼν δέ ἐγώ ἑρμῆς ἐν πτυχή αἰθήρ νεφέλη καλύψας—οὐ γὰρ ἠμέλησέ ἐγώ Ζεύς—τόνδ εἰς οἶκος πρωτεύς ἱδρύω πᾶς προκρίνω σώφρων βροτός ἀκέραιος ὡς σώζω μενέλαος λέχος κἀγὼ μὲν ἐνθάδε εἰμί ὁ δέ ἄθλιος πόσις στράτευμα ἀθροίζω τὰς ἐμὰς ἀναρπαγὰς θηράω πορευθεὶς ἴλιος πύργωμα ψυχαὶ δὲ πολλαὶ δι ἐγώ ἐπὶ σκαμάνδριος ῥοή ἔθανον ὁ δὲ πᾶς τλᾶσ ἐγὼ κατάρατός εἰμί καὶ δοκέω προδοῦσ ἐμὸν πόσις συνάπτω πόλεμος ἕλλην μέγας τίς οὖν ἔτι ζάω θεάομαι ὅδε εἰσήκουσ ἔπος 
ἑρμῆς τὸ κλεινὸν ἔτι κατοικέω πέδον σπάρτη σὺν ἀνήρ γιγνώσκω ὡς εἰς ἴλιος οὐ ἔρχομαι ἢν μὴ λέκτρον ὑπόστόρεννυμι τις""")

In [None]:
FinalOrestes= tb("""οὐ εἰμί οὐδὲν δεινὸν ὧδε εἶπον ἔπος οὐδὲ πάθος οὐδὲ ξυμφορὰ θεήλατος ὅς οὐ ἂν αἴρω ἄχθος ἄνθρωπος φύσις ὁ γὰρ μακάριος Διὸς φύω ὡς λέγω τάνταλος κορυφή ὑπερτέλλω δειμαίνω πέτρος ἀήρ ποτᾶται καὶ τίνω οὗτος δίκη ὡς μὲν λέγω ὅστις θεάω ἄνθρωπος εἰμί κοινόω τράπεζα ἀξιόω ἔχω ἴσος ἀκόλαστος ἔχω γλῶσσα αἰσχρός νόσος οὗτος φυτεύω πέλοψ ὁ δέ Ἀτρεὺς φύω ὅς στέμμα ξήνασ ἐπικλώθω θεὰ ἔρις θυέστης πόλεμος εἰμί σύγγονος θέσθαι τίς ἐρῶ ἀναμετρήσασθαί ἐγώ δέω δαίνυμι δέ οὖν νιν τέκνον ἀποκτείνω ἀτρεύς ἀτρεύς δέ ὁ κλεινός εἰ δὴ κλεινός ἀγαμέμνων φύω Μενέλεώς τε κρής μητρὸς ἀερόπη ἀπό γαμέω δέ ὁ μὲν δὴ τὴν θεάω στυγέω μενέλαος ἑλένη ὁ δὲ κλυταιμνήστρα λέχος ἐπίσημος εἰς ἕλλην ἀγαμέμνων ἄναξ ὅς παρθένος μὲν τρέω φύω ἐκ εἷς χρυσόθεμις Ἰφιγένειά σύ ἠλέκτρη σύ ἐγώ ἄρσην δέ ὀρέστης μητρὸς ἀνόσιος ἣ πόσις ἄπειρος περιβαλοῦσ ὕφασμα ἔκτεινεν ὅς δέ ἕκητι παρθένος λέγω οὐ καλόν ἐάω οὗτος ἀσαφὲς ἐν κοινόω σκοπάω φοῖβος δέ ἀδικία μὲν τίς δέω κατηγορέω πείθω δέ ὀρέστης μήτηρ ὁ ἕπομαι γείνομαι κτείνω πρὸς οὐ ἅπας εὔκλεια φέρω ὁμόω δέ ἀποκτείνω οὐ ἀπειθέω θεάω κἀγὼ μετέχω οἷος δὴ γυνὴ φονόω πυλάδης σύ ὃς ἐγώ συγκατεργάζομαι ὅδε ἐντεῦθεν ἄγριος συντακεὶς †νόσωι νοσέω τλήμων ὀρέστης ὅδε πεσὼν ἐν δέμνιον κεῖμαι τὸ μητρὸς δέ αἷμά νιν τροχηλατέω μανίαισιν ὀνομάζω γὰρ αἰδέομαι θεὰς εὐμενίδες αἳ ὅδε ἐξαμιλλάομαι φόβος ἕκτος δὲ δὴ ὅδε ἦμαρ ἐκ ὅστις σφαγή θνήσκω μήτηρ πυρὶ καθαγνίζω δέμας ὅς οὔτε σῖτος διὰ δειρή δείκνυμι οὐ λουτρόν δίδωμι χρωτί χλανίδιον δέ ἔσω κρύπτω ὅταν μὲν σῶμα κουφίζω νόσος ἔμφρων δακρύω ποτὲ δὲ δέμνιον ἀπό πηδάω δρομαῖος πῶλος ὣς ὑπὸ ζυγόω δοκέω δέ ἀργέω ὅδε μήτις ἐγώ στέγη μὴ πυρὶ δέχομαι μηδὲ προσφωνέω τις μητροκτονοῦντας κύριος δέ ὅδε ἥμερος ἐν ἵημι διαφέρω ψῆφος ἀργεῖος πόλις εἰ χρὴ θνήσκω νὼ λεύσιμος πέτρωμα ἢ φάσγανον θήγω ἐπί αὐχήν βάλλω ἐλπίς δὲ δή τις ἔχω ὥστε μὴ θανεῖν ἥκω γὰρ εἰς γαῖα μενέλαος τροία ἀπό λιμήν δὲ ναυπλιεύς ἐκπληρόω πλάτης ἀκτή ὁρμάω δαρὸν ἐκ τροία χρόνος ἄλη πλαγχθείς τὴν δὲ δὴ πολυκτόνος ἑλένη φυλάσσω νύξ μή τις εἰσιδὼν μετά ἥμερος 
στείχω ὅς ὑπό ἴλιος παῖς θνήσκω εἰς πετρόω ἔρχομαι βολή""")

In [None]:
# Report the three highest-scoring words of each document in the corpus.
bloblist = [FinalHelen, FinalOrestes]
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    # Map each distinct word of this document to its TF-IDF score.
    scores = {}
    for word in blob.words:
        scores[word] = tfidf(word, blob, bloblist)
    # Stable descending sort, so ties keep first-occurrence order.
    sorted_words = list(scores.items())
    sorted_words.sort(key=lambda pair: pair[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

# DTM (Document Term Matrix) and Visualizing Similarities

# Author Similarities

In [None]:
#term matrices and vectoring
#following code is a combo of the cltk vectorizer and Allen Riddell's vectorizers and plotting
#https://de.dariah.eu/tatom/working_with_text.html#creating-a-document-term-matrix

import numpy as np  # a conventional alias
import os  # for os.path.basename
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import CountVectorizer

# Author-level bag-of-words files, one per dramatist.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
filenames = [
'/Users/wimsey/cltk_data/tmp/aeschylus/aeschylus_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/aristophanes/aristophanes_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/euripides/euripides_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/sophocles/sophocles_author_bag_of_words.txt'
]
# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Euclidean distance between the document count vectors.
from sklearn.metrics.pairwise import euclidean_distances
dist = euclidean_distances(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'orange'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
import numpy as np  # a conventional alias
import os  # for os.path.basename
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import CountVectorizer

# Author-level bag-of-words files, one per dramatist.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
filenames = [
'/Users/wimsey/cltk_data/tmp/aeschylus/aeschylus_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/aristophanes/aristophanes_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/euripides/euripides_author_bag_of_words.txt',
'/Users/wimsey/cltk_data/tmp/sophocles/sophocles_author_bag_of_words.txt'
]

# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
# all of Aeschylus compared
# Per-play input files; the aesch_* names are expected to be defined
# before this cell runs.
filenames=[aesch_ag,
           aesch_eum,
           aesch_lib,
           aesch_pb,
           aesch_pers,
           aesch_seven,
           aesch_supp]

# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
# all of Sophocles compared
# Per-play input files; the soph_* names are expected to be defined
# before this cell runs.
filenames=[soph_aj,
           soph_ant,
           soph_el,
           soph_oc,
           soph_ot,
           soph_phil,
           soph_trach]
# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
# all of the available Euripides compared
# Per-play input files; the eur_* names are expected to be defined
# before this cell runs.
filenames=[eur_ba,
           eur_el,
           eur_hec,
           eur_her,
           eur_ia,
           eur_ion,
           eur_it,
           eur_orest,
           eur_phoen,
           eur_rh,
           eur_supp,
           eur_tro]
# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
#all available extant tragedy
# Per-play input files for all three tragedians; the eur_*, soph_* and
# aesch_* names are expected to be defined before this cell runs.
filenames=[eur_ba,
           eur_el,
           eur_hec,
           eur_her,
           eur_ia,
           eur_ion,
           eur_it,
           eur_orest,
           eur_phoen,
           eur_rh,
           eur_supp,
           eur_tro,
           soph_aj,
           soph_ant,
           soph_el,
           soph_oc,
           soph_ot,
           soph_phil,
           soph_trach,
           aesch_ag,
           aesch_eum,
           aesch_lib,
           aesch_pb,
           aesch_pers,
           aesch_seven,
           aesch_supp]

# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# Create a figure of size 16x12 inches, 80 dots per inch
plt.figure(figsize=(16, 12), dpi=80)

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
#all Aristophanes compared
# Per-play input files; the aristoph_* names are expected to be defined
# before this cell runs.
filenames=[aristoph_ach,
           aristoph_birds,
           aristoph_cl,
           aristoph_eccl,
           aristoph_frogs,
           aristoph_kn,
           aristoph_lys,
           aristoph_wealth,
           aristoph_peace,
           aristoph_thesmo,
           aristoph_wasps]
# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
#all dramatists
# Per-play input files for all four dramatists; the eur_*, soph_*, aesch_*
# and aristoph_* names are expected to be defined before this cell runs.
filenames=[eur_ba,
           eur_el,
           eur_hec,
           eur_her,
           eur_ia,
           eur_ion,
           eur_it,
           eur_orest,
           eur_phoen,
           eur_rh,
           eur_supp,
           eur_tro,
           soph_aj,
           soph_ant,
           soph_el,
           soph_oc,
           soph_ot,
           soph_phil,
           soph_trach,
           aesch_ag,
           aesch_eum,
           aesch_lib,
           aesch_pb,
           aesch_pers,
           aesch_seven,
           aesch_supp,
           aristoph_ach,
           aristoph_birds,
           aristoph_cl,
           aristoph_eccl,
           aristoph_frogs,
           aristoph_kn,
           aristoph_lys,
           aristoph_wealth,
           aristoph_peace,
           aristoph_thesmo,
           aristoph_wasps]

# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# short versions of filenames:
names = [os.path.basename(fn).replace('.txt', '') for fn in filenames]

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]
# Create a figure of size 16x12 inches, 80 dots per inch
plt.figure(figsize=(16, 12), dpi=80)

# One colour for every point; each point is labelled with the part of its
# file name before the first underscore.
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name.split("_")[0])

plt.show()

In [None]:
import numpy as np  # a conventional alias
import os  # for os.path.basename
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import CountVectorizer
import glob
# NOTE(review): absolute path on one machine; adjust before running elsewhere.
tmp_path = '/Users/wimsey/cltk_data/tmp/'

# Euripides and Aeschylus collection of plays and their words
euripides_plays_vocabulary = 'euripides/eur.*_gk_vocab.txt' # list of all plays bag of word files
aeschylus_plays_vocabulary = 'aeschylus/aesch.*_gk_vocab.txt' # list of all plays bag of word files

#Characters in Aristophanes Frogs:
#aristophanes_frogs_vocabulary = aristophanes/aristoph.frogs_gk_vocab.txt # frogs play bag of words
#aristophanes/aristoph.frogs_gk_files.txt # list of all files for all lines of play (The file name has the speaker and starting and ending line number)

Aeschylus = ["αἰσχύλος"]
aristophanes_aeschylus = 'aristophanes/αἰσχύλος/aristoph.frogs_gk_αἰσχύλος_vocab.txt'
# example of one of the lines from frogs filename: aristoph.frogs_gk_αἰσχύλος_1006_1008.txt

Euripides = ["εὐριπίδης"]
aristophanes_euripides = 'aristophanes/εὐριπίδης/aristoph.frogs_gk_εὐριπίδης_vocab.txt'
# example of one of the lines from frogs filename: aristoph.frogs_gk_εὐριπίδης_start_end.txt

corpus = list()

# Add every Euripides play, then every Aeschylus play. glob returns files
# in filesystem order, so each group is sorted to keep the document order
# (and therefore the plot) the same from run to run.
for pattern in (euripides_plays_vocabulary, aeschylus_plays_vocabulary):
    for f in sorted(glob.glob(os.path.join(tmp_path, pattern))):
        play = os.path.basename(f).replace("_gk_vocab.txt", "")
        corpus.append(dict(name=play, filename=f))

# Euripides and Aeschylus from frogs
corpus.append(dict(name="Aeschylusfrogs", filename=os.path.join(tmp_path, aristophanes_aeschylus)))
corpus.append(dict(name="Euripidesfrogs", filename=os.path.join(tmp_path, aristophanes_euripides)))

# Parallel lists: input path and plot label for each document.
filenames = [d['filename'] for d in corpus]
names = [d['name'] for d in corpus]

# Count tokens exactly as written: no accent stripping, no lowercasing.
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents=None, lowercase=False)
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dtm = dtm.toarray()  # convert to a regular array
vocab = np.array(vocab)

# Cosine distance = 1 - cosine similarity between document count vectors.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm)

# Two components because the points are drawn on a plane; "precomputed"
# because a distance matrix is supplied; `random_state` keeps the layout
# reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# Create a figure of size 16x12 inches, 80 dots per inch
plt.figure(figsize=(16, 12), dpi=80)

# `color` must be defined in this cell: the per-author rule below is
# disabled, and without an assignment the cell only runs when an earlier
# cell happened to leave `color` behind.
# NOTE(review): the disabled rule tests for the substring "aeschylus", which
# would not match labels such as "aesch.<play>" or "Aeschylusfrogs" — confirm
# the intended test before re-enabling it.
#     color = 'orange' if "aeschylus" in name else 'skyblue'
color = 'skyblue'
plt.scatter(xs, ys, c=color)
for x, y, name in zip(xs, ys, names):
    plt.text(x, y, name)


_ = plt.show()

In [None]:
# après Jeremy M. Stober, Tim Vieira
# https://github.com/timvieira/viz/blob/master/mds.py
# Embed the current distance matrix `dist` in three dimensions.
mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, 3)
from mpl_toolkits.mplot3d import Axes3D  # importing this registers the '3d' projection on older matplotlib releases

fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])

# Label each point with its document name. Iterate over `names` (the list
# of labels), not `name`, which is only the last label string left behind
# by an earlier plotting loop and would yield single characters.
for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], names):
    ax.text(x, y, z, s, fontsize=20)

_ = plt.show()

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

# Hierarchical clustering of the documents with Ward linkage.
# NOTE(review): `dist` is a square distance matrix here; SciPy documents
# ward() as taking a condensed distance matrix or raw observation vectors —
# confirm the square matrix is the intended input.
linkage_matrix = ward(dist)

# orientation="right" matches the dendrogram returned by R's hclust()
dendrogram(
    linkage_matrix,
    labels=names,
    orientation="right",
)