In [1]:
import csv
import math
import string
import unicodedata

from collections import Counter

import nltk

import matplotlib.pyplot as plt

from cltk.stops.grc import STOPS
from cltk import NLP
from cltk.alphabet.processes import GreekNormalizeProcess

from ajax_odysseus_speech_analysis.group_speeches import AjaxDocument

In [2]:
DOC = AjaxDocument()
GREEK_STOPS = [unicodedata.normalize('NFC', s) for s in STOPS]
ajax = 'Αἴας'
odysseus = 'Ὀδυσσεύς'

‎𐤀 CLTK version '1.1.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


In [3]:
speeches = DOC.group_lines()

In [96]:
from dataclasses import dataclass

def docify_lines(speaker):
    lines = speeches[speaker]
    docified_lines = []
    
    for l in lines:
        docified_lines.append(l)
    
    print(docified_lines)
    return "\n".join(docified_lines)

def clean_string(s):
    return s.strip(string.punctuation)

@dataclass
class Line:
    n: str
    raw_text: str
    words: list[any]

def analyze_lines(speaker):
    lines = speeches[speaker]
    analyzed_lines = []
    for line in lines:
        analyzed = DOC.nlp.analyze(line[1])
        analyzed_lines.append(Line(n=line[0], raw_text=line[1], words=analyzed.words))
        
    return analyzed_lines

In [115]:
def calculate_lemma_tf(term, doc):
    counts = {}
    for word in doc:
        if counts.get(word.lemma) is not None:
            counts[word.lemma] += 1
        else:
            counts[word.lemma] = 1
            
    total_terms = sum([count for t, count in counts.items()])
    return counts.get(term.lemma) / total_terms

In [116]:
def calculate_lemma_idf(term, docs):
    n = len(docs)
    d = 1 + sum([1 for d in docs if term.lemma in d])
    
    return math.log(n / d, 2)

In [97]:
all_speaker_lines_analyzed = dict([(speaker, analyze_lines(speaker)) for speaker, _lines in speeches.items()])

In [81]:
speaker_docs = []

for _s, ls in all_speaker_lines_analyzed.items():
    for l in ls:
        speaker_docs += [w.lemma for w in l.words]

In [117]:
from itertools import chain

def tf_idf_for_speaker(speaker):
    speaker_words = list(chain.from_iterable([l.words for l in all_speaker_lines_analyzed[speaker]]))
    speaker_tfs = [(term, calculate_lemma_tf(term, speaker_words)) for term in speaker_words]
    speaker_idfs = [(term, calculate_lemma_idf(term, speaker_docs)) for term in speaker_words]
    
    tf_idfs = []
    for (term, tf) in speaker_tfs:
        idf = next(idf for (t, idf) in speaker_idfs if t == term)
        tf_idfs.append((term.lemma, tf*idf))
    
    return dict(tf_idfs)

In [118]:
tf_idfs = dict([(speaker, tf_idf_for_speaker(speaker)) for speaker, _lines in speeches.items()])

In [121]:
def create_ngrams(word_list, n):
    ngrams = []
 
    for num in range(0, len(word_list)):
        ngram = tuple(word_list[num:num + n])
        ngrams.append(ngram)
 
    return ngrams

def score_ngram(ngram, weights, max_score):
    scores = [weights.get(w[0].lemma, 0) for w in ngram]
    
    return (sum(scores) / len(scores)) / max_score

In [127]:
def compare_speakers_with_unigrams(speaker_a, speaker_b):
    speaker_a_lemmata = all_speaker_lemmata[speaker_a]
    speaker_b_tf_idf = tf_idfs[speaker_b]
    
    return [speaker_b_tf_idf.get(l, 0) for l in speaker_a_lemmata]

def compare_speakers_with_ngrams(speaker_a, speaker_b, n):
    speaker_a_lines = all_speaker_lines_analyzed[speaker_a]
    speaker_a_words = []
    for line in speaker_a_lines:
        for word in line.words:
            speaker_a_words.append((word, line))
    speaker_a_ngrams = create_ngrams(speaker_a_words, n)
    speaker_b_tf_idf = tf_idfs[speaker_b]
    max_tf_idf = speaker_b_tf_idf[max(speaker_b_tf_idf, key=speaker_b_tf_idf.get)]
    
    scored_ngrams = [score_ngram(ngram, speaker_b_tf_idf, max_tf_idf) for ngram in speaker_a_ngrams]
    
    return (speaker_a_ngrams, scored_ngrams)

In [133]:
def plot_comparison(speaker_a, speaker_b, n=4):
    speaker_a_ngrams, scores = compare_speakers_with_ngrams(speaker_a, speaker_b, n)
    ngram_score_dict = dict(zip(speaker_a_ngrams, scores))
    x_axis = range(0, len(scores))
    fig, ax = plt.subplots()
    fig.suptitle("{} compared to {} by {}s".format(speaker_a, speaker_b, n))
    high_score_key = max(ngram_score_dict, key=ngram_score_dict.get)
    ax.annotate(" ".join(high_score_key), xy=(speaker_a_ngrams.index(high_score_key), ngram_score_dict[high_score_key]))
    ax.plot(x_axis, scores)
    plt.ylim([0, 0.6])
    plt.show()

In [124]:
import itertools

In [125]:
pairs = itertools.permutations([s for s, _l in tf_idfs.items()], 2)
only_main_actors = ['Αἴας', 'Ὀδυσσεύς', 'Τεῦκρος', 'Μενέλαος', 'Ἀγαμέμνων', 'Τέκμησσα', 'Ἀθήνα']
pairs = [p for p in pairs if p[0] in only_main_actors and p[1] in only_main_actors]

In [134]:
for pair in pairs:
    plot_comparison(pair[0], pair[1], 6)

((Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='σὲ', pos=pronoun, lemma='σύ', stem=None, scansion=None, xpos='Pp', upos='PRON', dependency_relation='obj', governor=5, features={Case: [accusative], Gender: [masculine], Number: [singular], Person: [second], PrononimalType: [personal]}, category={F: [pos], N: [pos], V: [neg]}, stop=False, named_entity=None, syllables=None, phonetic_transcription=None, definition=None), Line(n='1226', raw_text='σὲ δὴ τὰ δεινὰ ῥήματʼ ἀγγέλλουσί μοι', words=[Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='σὲ', pos=pronoun, lemma='σύ', stem=None, scansion=None, xpos='Pp', upos='PRON', dependency_relation='obj', governor=5, features={Case: [accusative], Gender: [masculine], Number: [singular], Person: [second], PrononimalType: [personal]}, category={F: [pos], N: [pos], V: [neg]}, stop=False, named_entity=None, syllables=None, phonetic_transcription=None, definition=None), W

TypeError: unhashable type: 'Word'