In [1]:
import csv
from sklearn.cluster import DBSCAN
from collections import defaultdict
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from scipy.special import softmax
from scipy.spatial.distance import cdist
import itertools
import numpy as np
from gensim.utils import simple_preprocess
from gensim.models import FastText
import os
import codecs

In [2]:
# set paths
# corpus_path = "/Users/ryderwishart/w2v/ancient-greek-word2vec/data/corpus"
model_path = "/Users/ryderwishart/w2v/ancient-greek-word2vec/models/ft_papyri&corpus_cbow_hs_2_to_5_size300_window5_mincount2.model"


## Check for missing files in either the lemmatized or non-lemmatized directories

In [101]:
import os

lemmatized_path = "/Users/ryderwishart/biblical-machine-learning/data/lemmas"
non_lemmatized_path = "/Users/ryderwishart/biblical-machine-learning/data/texts"

lemmatized_files = set(os.listdir(lemmatized_path))
non_lemmatized_files = set(os.listdir(non_lemmatized_path))

missing_from_lemmatized = non_lemmatized_files - lemmatized_files
missing_from_non_lemmatized = lemmatized_files - non_lemmatized_files

if missing_from_lemmatized:
    print("Missing files in lemmatized directory:")
    print(missing_from_lemmatized)

if missing_from_non_lemmatized:
    print("Missing files in non-lemmatized directory:")
    print(missing_from_non_lemmatized)
    
missing_files = missing_from_lemmatized | missing_from_non_lemmatized

Missing files in non-lemmatized directory:
{'tlg0013.tlg015.perseus-grc2.txt', 'tlg0013.tlg013.perseus-grc2.txt', 'tlg0013.tlg033.perseus-grc2.txt', 'tlg2313.perseus001.perseus-grc1.txt', 'tlg0082.tlg001.1st1K-grc1.txt', 'tlg0565.tlg002.1st1K-grc1.txt', 'tlg0082.tlg002.1st1K-grc1.txt', 'tlg2313.perseus002.perseus-grc1.txt', 'tlg0013.tlg020.perseus-grc2.txt'}


## Preprocessing: Tokenize and lemmatize the corpus, splitting it into sentences or documents.

In [168]:
# Define preprocessing functions and pad each sentence to the same length
force_lowercase = True
use_lemma_disambiguation = False # Some lemmas are indicated with a numeric suffix (e.g., 'ὅτι2')

# Change to the directory entered (this is necessary to use the codecs.open() method). 
# TODO: rewrite this corpus iterator without the codecs module.

# if not(os.getcwd().split('/')[-1].endswith('corpus')):
#     os.chdir(corpus_directory)

# This class streams through the corpus when called.

def tokenize(string):
    output = string
    if use_lemma_disambiguation:
        pass
    else:
        # Filter numeric digits from token
        output = ''.join(filter(lambda x: not x.isdigit(), string))
    if force_lowercase:
        return [token.lower() for token in output.split()]
    else:
        return output.split()
    
import os
import codecs
import csv

import os
import codecs

class MySentences(object):
    def __init__(self, tsv_path):
        self.tsv_path = tsv_path
        self.files = self._get_tsv_files()

    def __iter__(self):
        for tsv_file in self.files:
            with open(tsv_file, 'r', encoding='utf-8') as f:
                header = next(f)
                for line in f:
                    row = line.rstrip().split('\t')
                    if len(row) > 3 and row[3]:
                        yield tokenize(row[3])
                    else:
                        yield []

    def _get_tsv_files(self):
        tsv_files = []
        for file in os.listdir(self.tsv_path):
            if file.endswith(".tsv"):
                tsv_files.append(os.path.join(self.tsv_path, file))
        return tsv_files

    def get_sentence(self, index, id=None, filename=None):
        if id is not None and filename is not None:
            tsv_file = os.path.join(self.tsv_path, filename)
            with open(tsv_file, 'r', encoding='utf-8') as f:
                header = next(f)
                for line_number, line in enumerate(f):
                    row = line.rstrip().split('\t')
                    if int(row[0]) == id:
                        tokens = tokenize(row[2])
                        lemmas = tokenize(row[3])
                        if len(tokens) != len(lemmas):
                            tokens.append('[LENGTH_MISMATCH]')
                        return lemmas, tokens, tsv_file, line_number

        i = 0
        for tsv_file in self.files:
            with open(tsv_file, 'r', encoding='utf-8') as f:
                header = next(f)
                for line_number, line in enumerate(f):
                    row = line.rstrip().split('\t')
                    tokens = tokenize(row[2])
                    lemmas = tokenize(row[3])
                    if i == index:
                        if len(tokens) != len(lemmas):
                            tokens.append('[LENGTH_MISMATCH]')
                        return lemmas, tokens, tsv_file, line_number
                    i += 1


    

In [169]:
# Instantiate corpus reader
corpus = MySentences('/Users/ryderwishart/Documents/Storage/Programming/Corpora/celano/LemmatizedAncientGreekXML/sentence_pair_tsvs/')

In [170]:
# Calculate max_sentence_length
max_sentence_length = 0
for sentence in corpus:
    if len(sentence) > max_sentence_length:
        max_sentence_length = len(sentence)

In [171]:
max_sentence_length

543

In [172]:
# pad all sentences to maximum length with [PAD] token
# for i, sentence in enumerate(corpus):
#     if len(sentence) == 0:
#         pass
#     elif len(sentence) < max_sentence_length:
#         sentence.extend(['[PAD]'] * (max_sentence_length - len(sentence)))
        
def pad_sentence(sentence, max_sentence_length):
    if len(sentence) == 0:
        return sentence
    elif len(sentence) < max_sentence_length:
        sentence.extend(['[PAD]'] * (max_sentence_length - len(sentence)))
        return sentence
    else:
        return sentence


## Initialization of Vocabulary Embeddings: Start with global vectors (e.g., FastText or Word2Vec) as initialization of vocabulary embeddings.

In [173]:
# initialize FastText model with pre-trained embeddings
model = FastText.load(model_path)

## Contextual Vectors: For each sentence in the corpus, calculate the contextual vector for each token by taking a weighted sum of the token's global vector and the global vectors of its neighboring tokens. The weighting can be done using a Gaussian kernel, with the distance between tokens as the input to the kernel.

In [175]:
# set the number of samples (None to use complete corpus, some subset for testing)
samples = 1000

# define function to calculate contextual vectors for a sentence


def get_word_embedding(token, model):
    if token in model.wv:
        return model.wv[token].reshape(-1)
    else:
        return np.zeros(model.vector_size)


def calculate_contextual_vectors(sentence, model, window_size=5):
    contextual_vectors = []
    embedding_size = model.vector_size
    sentence_len = len(sentence)

    for i, token in enumerate(sentence):
        # Get the global vector for the token from the FastText model
        token_embedding = get_word_embedding(token, model)
        # print('token_embedding.shape', token_embedding.shape)
        # Initialize the contextual vector as the global vector
        contextual_vector = np.zeros(embedding_size)
        # print('contextual_vector.shape', contextual_vector.shape)
        # Calculate the attention scores for neighboring tokens within the window
        for j in range(max(0, i - window_size), min(sentence_len, i + window_size + 1)):
            if j != i:
                neighbor_token = sentence[j]
                neighbor_embedding = get_word_embedding(neighbor_token, model)
                # Print shape of neighbour embedding
                # print('neighbor_embedding.shape', neighbor_embedding.shape)
                if neighbor_token != "[PAD]":
                    # Calculate the distance between the embeddings
                    distance = cdist(token_embedding.reshape(
                        1, -1), neighbor_embedding.reshape(1, -1), metric='cosine')

                    # Convert distance to attention score using softmax
                    attention_score = softmax(-distance)

                    # Update the contextual vector with a weighted sum of the neighbor's global vector
                    contextual_vector += attention_score.reshape(-1) * \
                        neighbor_embedding

        # Normalize the contextual vector
        contextual_vector = contextual_vector / np.linalg.norm(contextual_vector)

        contextual_vectors.append(contextual_vector)

    return np.array(contextual_vectors)


# Calculate contextual vectors for a subset of the corpus
contextual_vectors_corpus = []
for i, sentence in enumerate(corpus):
    padded_sentence = pad_sentence(sentence, max_sentence_length)
    if padded_sentence is None:
        print('padded_sentence is None')
        pass
    elif i < samples:
        # print(i, sentence)
        contextual_vectors = calculate_contextual_vectors(padded_sentence, model)
        contextual_vectors_corpus.append(contextual_vectors)


  contextual_vector = contextual_vector / np.linalg.norm(contextual_vector)


## Token-based Contextual Vectors: Create a dictionary where the keys are tokens, and the values are lists of their contextual vectors. Also, store the sentence index and token index in the sentence for each contextual vector.

### Clustering: For each token, perform clustering on its contextual vectors. You can use a clustering algorithm like DBSCAN, which doesn't require specifying the number of clusters in advance, or other unsupervised clustering methods.

In [181]:
# Set cluster params
eps = 0.5 # The maximum distance between two samples for them to be considered as in the same neighborhood.
base_min_samples = 2  # Base value for min_samples
scaling_factor = 0.1  # Adjust this value to control the influence of token frequency on min_samples

In [189]:
import numpy as np
import math

# 1. Token-based Contextual Vectors
token_context_vectors = defaultdict(list)

for sentence_idx, (sentence, sentence_context_vectors) in enumerate(zip(corpus, contextual_vectors_corpus)):
    for token_idx, (token, context_vector) in enumerate(zip(sentence, sentence_context_vectors)):
        token_context_vectors[token].append(
            (context_vector, sentence_idx, token_idx))

# 2. Clustering
token_clusters = {}
for token, context_vector_data in token_context_vectors.items():
    context_vectors = [data[0] for data in context_vector_data if len(data[0]) > 0]
    
    # Filter out context vectors containing NaN values
    context_vectors = [vec for vec in context_vectors if not np.isnan(vec).any()]
    
    if len(context_vectors) == 0:
        continue
    else:
        token_frequency = len(context_vector_data)
        # min_samples_scaled = int(base_min_samples + (scaling_factor * math.log(token_frequency + 1))) # log scaling
        min_samples_scaled = int(base_min_samples + (scaling_factor * (token_frequency ** 0.5))) # sqrt/power-law scaling

        clustering = DBSCAN(eps=eps, min_samples=min_samples_scaled).fit(context_vectors)
        token_clusters[token] = clustering.labels_

### Generalized Context Vectors: For each cluster, calculate the centroid (mean vector) of the contextual vectors in the cluster. This will be the generalized context vector for the cluster.

In [190]:
# 3. Generalized Context Vectors
token_generalized_vectors = {}
for token, context_vector_data in token_context_vectors.items():
    context_vectors = [data[0] for data in context_vector_data]

    if token not in token_clusters:  # If the token doesn't have a cluster
        token_generalized_vectors[token] = context_vectors
        continue

    labels = token_clusters[token]
    # Ignore noise points labeled as -1
    num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    generalized_vectors = []
    for cluster_idx in range(num_clusters):
        cluster_vectors = [vec for vec, label in zip(
            context_vectors, labels) if label == cluster_idx]
        centroid = np.mean(cluster_vectors, axis=0)
        generalized_vectors.append(centroid)
    token_generalized_vectors[token] = generalized_vectors

# 4. Example Sentences
token_examples = defaultdict(lambda: defaultdict(list))
num_examples = 3

for token, context_vector_data in token_context_vectors.items():
    context_vectors = [data[0] for data in context_vector_data]

    if token not in token_clusters:  # If the token doesn't have a cluster
        for context_vector, vector_data in zip(context_vectors, context_vector_data):
            sentence_idx, token_idx = vector_data[1:]
            lemmas, tokens, file, line_number = corpus.get_sentence(sentence_idx)
            token_examples[token][-1].append((0, sentence_idx, token_idx, file, line_number))
        token_examples[token][-1] = token_examples[token][-1][:num_examples]
        continue

    labels = token_clusters[token]
    generalized_vectors = token_generalized_vectors[token]
    for context_vector, label, vector_data in zip(context_vectors, labels, context_vector_data):
        sentence_idx, token_idx = vector_data[1:]
        lemmas, tokens, file, line_number = corpus.get_sentence(sentence_idx)
        if label != -1:  # Ignore noise points
            centroid = generalized_vectors[label]
            distance = cdist(context_vector.reshape(1, -1),
                             centroid.reshape(1, -1), metric='cosine')
            token_examples[token][label].append(
                (distance, sentence_idx, token_idx, file, line_number))
            token_examples[token][label].sort(
                key=lambda x: x[0])  # Sort by distance
            # Keep top examples
            token_examples[token][label] = token_examples[token][label][:num_examples]


In [191]:
# Print output
for token, context_examples in token_examples.items():
    number_of_clusters_for_token = len(context_examples)
    print(f"{token} ({number_of_clusters_for_token} cluster/s):")
    for context_label, examples in context_examples.items():
        print(f"\tgeneralized_context_vector_{context_label}:")
        for example in examples:
            distance, sentence_idx, token_idx, file, line_number = example
            lemmas, tokens, _, _ = corpus.get_sentence(sentence_idx)
            print(
                f"\t\tExample sentence {sentence_idx} including '{token}' in typical context {context_label}:")
            if tokens:
                print(f"\t\t\tTokens: {' '.join(tokens)}")
            print(f"\t\t\tLemmas: {' '.join(lemmas)}")
        # print("\t...")
    print("\n")

. (5 cluster/s):
	generalized_context_vector_0:
		Example sentence 2 including '.' in typical context 0:
			Tokens: πεπαιδευμένου γάρ ἐστι κατὰ τρόπον τὸ δύνασθαι κρῖναι εὐστόχως τί καλῶς ἢ μὴ καλῶς ἀποδίδωσιν ὁ λέγων.
			Lemmas: γάρ εἰμί κατά τρόπος ὁ δύναμαι κρίνω εὔστοχος τίς καλός τίη μή καλός ἀποδίδωμι ὁ λέγω .
		Example sentence 18 including '.' in typical context 0:
			Tokens: / πρὸς δὲ τούτοις, ἐπεὶ πλείους ὁρῶμεν αἰτίας περὶ τὴν γένεσιν τὴν φυσικήν, οἷον τήν θʼ οὗ ἕνεκα καὶ τὴν ὅθεν ἡ ἀρχὴ τῆς κινήσεως, διοριστέον καὶ περὶ τούτων, ποία πρώτη καὶ δευτέρα πέφυκεν. [LENGTH_MISMATCH]
			Lemmas: πρός δέ οὗτος , ἐπεί πολύς ὁράω αἰτία περί ὁ γένεσις ὁ φυσικός , οἷος ὁ οὗ ἕνεκα καί ὁ ὅθεν ὁ ἀρχή ὁ κίνησις , διορίζω καί περί οὗτος , ποιός πρῶτος καί δεύτερος φύω .
		Example sentence 24 including '.' in typical context 0:
			Tokens: σχεδὸν τοὺς λόγους ἀνάγειν, οὐ διελόμενοι ποσαχῶς λέγεται τὸ ἀναγκαῖον. [LENGTH_MISMATCH]
			Lemmas: σχεδόν ὁ λόγος ἀνάγω , οὐ διαιρέω ποσαχῶς λέγω ὁ ἀναγκα