In [22]:
import os
import numpy as np
from scipy.special import softmax
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from collections import defaultdict
from gensim.models import FastText
import random

# Set up constants
model_path = "/Users/ryderwishart/w2v/ancient-greek-word2vec/models/ft_papyri&corpus_cbow_hs_2_to_5_size300_window5_mincount2.model"
lemmas_path = "/Users/ryderwishart/biblical-machine-learning/data/lemmas"
texts_path = "/Users/ryderwishart/biblical-machine-learning/data/texts"
eps = 0.5
base_min_samples = 2
scaling_factor = 0.1
num_examples = 3
num_lexemes = 10

In [23]:
# Load FastText model
model = FastText.load(model_path)

In [27]:
class MySentences(object):
    def __init__(self, lemmas_path, texts_path):
        self.lemmas_path = lemmas_path
        self.texts_path = texts_path
        self.lemmas_files = self._get_txt_files(self.lemmas_path)
        self.texts_files = self._get_txt_files(self.texts_path)

    def __iter__(self):
        for lemmas_file, texts_file in zip(self.lemmas_files, self.texts_files):
            with open(lemmas_file, 'r', encoding='utf-8') as lemmas_f, open(texts_file, 'r', encoding='utf-8') as texts_f:
                for lemmas_line, texts_line in zip(lemmas_f, texts_f):
                    lemmas = lemmas_line.rstrip().split()
                    yield lemmas

    def _get_txt_files(self, path):
        txt_files = []
        for file in os.listdir(path):
            if file.endswith(".txt"):
                txt_files.append(os.path.join(path, file))
        return txt_files

    def get_sentence_by_index(self, index):
        i = 0
        for lemmas_file, texts_file in zip(self.lemmas_files, self.texts_files):
            with open(lemmas_file, 'r', encoding='utf-8') as lemmas_f, open(texts_file, 'r', encoding='utf-8') as texts_f:
                for line_number, (lemmas_line, texts_line) in enumerate(zip(lemmas_f, texts_f)):
                    lemmas = lemmas_line.rstrip().split()
                    texts = texts_line.rstrip().split()
                    if i == index:
                        if len(lemmas) != len(texts):
                            texts.append('[LENGTH_MISMATCH]')
                        return lemmas
                    i += 1

# Instantiate corpus reader
corpus = MySentences(lemmas_path, texts_path)

# Function to read and preprocess the corpus
# def preprocess_corpus(corpus_path):
#     with open(corpus_path, 'r', encoding='utf-8') as f:
#         lines = f.read().splitlines()
#         sentences = [line.split() for line in lines]
#         return sentences

# corpus = preprocess_corpus(corpus_path)

# Select random lexemes
N = 100  # or some other reasonable number
sentences = [sentence for _, sentence in zip(range(N), corpus)]
unique_lexemes = list(set([token for sentence in sentences for token in sentence]))
random_lexemes = random.sample(unique_lexemes, num_lexemes)

In [28]:
# Function to get word embeddings
def get_word_embedding(token, model):
    if token in model.wv:
        return model.wv[token]
    else:
        return np.zeros(model.vector_size)

# Function to calculate contextual vectors
def calculate_contextual_vectors(sentence, model, window_size=5):
    contextual_vectors = []
    for i, token in enumerate(sentence):
        token_embedding = get_word_embedding(token, model)
        contextual_vector = np.zeros(model.vector_size)
        for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
            if j != i:
                neighbor_token = sentence[j]
                neighbor_embedding = get_word_embedding(neighbor_token, model)
                if neighbor_token != "[PAD]":
                    distance = cdist(token_embedding.reshape(1, -1), neighbor_embedding.reshape(1, -1), metric='cosine')
                    attention_score = softmax(-distance)
                    contextual_vector += attention_score.reshape(-1) * neighbor_embedding
        contextual_vector = contextual_vector / np.linalg.norm(contextual_vector)
        contextual_vectors.append(contextual_vector)
    return np.array(contextual_vectors)

token_context_vectors = defaultdict(list)
for sentence_idx, sentence in enumerate(corpus):
    if any(lexeme in sentence for lexeme in random_lexemes):
        contextual_vectors = calculate_contextual_vectors(sentence, model)
        for token_idx, (token, context_vector) in enumerate(zip(sentence, contextual_vectors)):
            if token in random_lexemes:
                token_context_vectors[token].append((context_vector, sentence_idx, token_idx))


def calculate_clusters_and_generalized_vectors(corpus, lexemes):
    token_context_vectors = defaultdict(list)
    for sentence_idx, sentence in enumerate(corpus):
        if any(lexeme in sentence for lexeme in lexemes):
            contextual_vectors = calculate_contextual_vectors(sentence, model)
            for token_idx, (token, context_vector) in enumerate(zip(sentence, contextual_vectors)):
                if token in lexemes:
                    token_context_vectors[token].append((context_vector, sentence_idx, token_idx))

    token_clusters = {}
    for token, context_vector_data in token_context_vectors.items():
        context_vectors = [data[0] for data in context_vector_data]

        # Filter out context vectors containing NaN values
        context_vectors = [vec for vec in context_vectors if not np.isnan(vec).any()]
        
        if len(context_vectors) == 0:
            continue
            
        token_frequency = len(context_vector_data)
        min_samples_scaled = int(base_min_samples + (scaling_factor * (token_frequency ** 0.5)))
        clustering = DBSCAN(eps=eps, min_samples=min_samples_scaled).fit(context_vectors)
        token_clusters[token] = clustering.labels_

    token_generalized_vectors = {}
    for token, context_vector_data in token_context_vectors.items():
        context_vectors = [data[0] for data in context_vector_data]
        if token not in token_clusters:
            token_generalized_vectors[token] = context_vectors
            continue

        labels = token_clusters[token]
        num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        generalized_vectors = []
        for cluster_idx in range(num_clusters):
            cluster_vectors = [vec for vec, label in zip(context_vectors, labels) if label == cluster_idx]
            centroid = np.mean(cluster_vectors, axis=0)
            generalized_vectors.append(centroid)
        token_generalized_vectors[token] = generalized_vectors
    return token_clusters, token_generalized_vectors

token_clusters, token_generalized_vectors = calculate_clusters_and_generalized_vectors(corpus, random_lexemes)

# Function to select example sentences
def select_example_sentences(corpus, lexemes, token_clusters, token_generalized_vectors, token_context_vectors):
    token_examples = defaultdict(lambda: defaultdict(list))
    for token, context_vector_data in token_context_vectors.items():
        context_vectors = [data[0] for data in context_vector_data]
        if token not in token_clusters:
            for context_vector, vector_data in zip(context_vectors, context_vector_data):
                sentence_idx, token_idx = vector_data[1:]
                token_examples[token][-1].append((0, sentence_idx, token_idx))
            token_examples[token][-1] = token_examples[token][-1][:num_examples]
            continue

        labels = token_clusters[token]
        generalized_vectors = token_generalized_vectors[token]
        for context_vector, label, vector_data in zip(context_vectors, labels, context_vector_data):
            sentence_idx, token_idx = vector_data[1:]
            if label != -1:
                centroid = generalized_vectors[label]
                distance = cdist(context_vector.reshape(1, -1), centroid.reshape(1, -1), metric='cosine')
                token_examples[token][label].append((distance, sentence_idx, token_idx))
                token_examples[token][label].sort(key=lambda x: x[0])
                token_examples[token][label] = token_examples[token][label][:num_examples]
    return token_examples

token_examples = select_example_sentences(corpus, random_lexemes, token_clusters, token_generalized_vectors, token_context_vectors)




  contextual_vector = contextual_vector / np.linalg.norm(contextual_vector)


In [29]:
# Display output
for token, context_examples in token_examples.items():
    number_of_clusters_for_token = len(context_examples)
    print(f"{token} ({number_of_clusters_for_token} cluster/s):")
    for context_label, examples in context_examples.items():
        print(f"\tgeneralized_context_vector_{context_label}:")
        for example in examples:
            distance, sentence_idx, token_idx = example
            sentence = corpus.get_sentence_by_index(sentence_idx)
            print(
                f"\t\tExample sentence {sentence_idx} including '{token}' in typical context {context_label}:")
            print(f"\t\t\tTokens: {' '.join(sentence)}")
        print("\t...")
    print("\n")

φθάνω (1 cluster/s):
	generalized_context_vector_0:
		Example sentence 124346 including 'φθάνω' in typical context 0:
			Tokens: ὁ φθάνω
		Example sentence 270841 including 'φθάνω' in typical context 0:
			Tokens: ὁ γάρ ἀχαριστέω ὁ φθάνω καί ὁ ἀπό ὁ αὖθις ἀκούω ἔχω κατάγνωσις
		Example sentence 35202 including 'φθάνω' in typical context 0:
			Tokens: οὐ οἶδα ὅτι2 πρός ὁ πρέσβυς βλέπω ὁ δεύτερος κρίνω ἄνθρωπος καί διδαχή γίγνομαι ὁ ὁ φθάνω
	...


μέγας (3 cluster/s):
	generalized_context_vector_0:
		Example sentence 8 including 'μέγας' in typical context 0:
			Tokens: ἐπεί γάρ ὁ ἐλπίς αὐτός καί οὐ ἀφίημι πρός ὁ πάρειμι καί καθά ἄγκυρα ἀποκόπτω ὁ διάβολος ἐπεί οὐ ἰσχύω ὅτι2 ψευδής ὁ μέλλω ἕτερος ἔρχομαι ὁδός καί ἄνθρωπος λυμεών ἐπιχειρέω ὁ μένος ὅτι2 ὁ μέγας ἐκεῖνος ὁ πέρας
		Example sentence 63 including 'μέγας' in typical context 0:
			Tokens: μέγας μέν οὖν καί ὁ καί ὑπό ὁ λογισμός μή παραφέρω
		Example sentence 91 including 'μέγας' in typical context 0:
			Tokens: ἀλλά ὁράω διά ὁ ἑξῆς