In [19]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [20]:
class HALModel:
    def __init__(self, window_size=2):
        self.window_size = window_size
        self.word_index = {}
        self.context_index = {}
        self.co_occurrence_matrix = None

    def build_co_occurrence_matrix(self, sentences):
        # Build word and context indices
        for sentence in sentences:
            for word in sentence:
                if word not in self.word_index:
                    self.word_index[word] = len(self.word_index)
                for context_word in sentence:
                    if context_word != word:
                        if context_word not in self.context_index:
                            self.context_index[context_word] = len(self.context_index)

        # Initialize the co-occurrence matrix
        self.co_occurrence_matrix = np.zeros((len(self.word_index), len(self.context_index)))

        # Fill the co-occurrence matrix based on word-context pairs
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                start = max(0, i - self.window_size)
                end = min(len(sentence), i + self.window_size + 1)
                context_words = sentence[start:i] + sentence[i + 1:end]
                target_index = self.word_index.get(target_word, None)
                if target_index is not None:
                    for context_word in context_words:
                        context_index = self.context_index.get(context_word, None)
                        if context_index is not None:
                            self.co_occurrence_matrix[target_index, context_index] += 1

    def train(self, sentences):
        self.build_co_occurrence_matrix(sentences)

    def similarity(self, word1, word2):
        if self.co_occurrence_matrix is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Check if both words are in the vocabulary
        if word1 not in self.word_index or word2 not in self.word_index:
            # Handle the case where one or both words are not in the vocabulary
            return 0.0  # You can adjust this default value as needed

        index1, index2 = self.word_index[word1], self.word_index[word2]
        vector1 = self.co_occurrence_matrix[index1, :]
        vector2 = self.co_occurrence_matrix[index2, :]
        similarity = cosine_similarity([vector1], [vector2])[0][0]
        return similarity

In [21]:
def evaluate_similarity(model, word_pairs, human_scores):
    predicted_scores = [model.similarity(word1, word2) for word1, word2 in word_pairs]
    spearman_corr, _ = spearmanr(predicted_scores, human_scores)
    return spearman_corr

In [22]:
def k_fold_cross_validation(model, sentences, word_pairs, human_scores, k=10):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    spearman_correlations = []

    for fold, (train_indices, val_indices) in enumerate(kf.split(sentences)):
        print(f"\nFold {fold + 1}")

        # Split the data into training and validation sets
        train_sentences = [sentences[i] for i in train_indices]
        val_sentences = [sentences[i] for i in val_indices]

        # Train the HAL model on the training set
        model.train(train_sentences)

        # Evaluate the model on the validation set
        spearman_corr = evaluate_similarity(model, word_pairs, human_scores)
        print(f"Spearman Correlation: {spearman_corr:.4f}")

        # Append the correlation for this fold to the list
        spearman_correlations.append(spearman_corr)

    # Calculate and print the overall average correlation across all folds
    overall_avg_corr = np.mean(spearman_correlations)
    print(f"\nOverall Average Spearman Correlation: {overall_avg_corr:.4f}")

In [23]:
from nltk.tokenize import word_tokenize
import string
import nltk
nltk.download('punkt')
import pandas as pd

lines = []
with open('shakespeare.txt', 'r') as f:
    for l in f:
        lines.append(l)

# remove new lines
lines = [line.rstrip() for line in lines]
# make all characters lower
lines = [line.lower() for line in lines]
# remove punctuations from each line
lines = [line.translate(str.maketrans('', '', string.punctuation)) for line in lines]
# tokenize
lines = [word_tokenize(line) for line in lines]
print(lines[:20])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['act', 'i'], ['scene', 'i', 'london', 'the', 'palace'], ['enter', 'king', 'henry', 'lord', 'john', 'of', 'lancaster', 'the', 'earl', 'of', 'westmoreland', 'sir', 'walter', 'blunt', 'and', 'others'], ['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care'], ['find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant'], ['and', 'breathe', 'shortwinded', 'accents', 'of', 'new', 'broils'], ['to', 'be', 'commenced', 'in', 'strands', 'afar', 'remote'], ['no', 'more', 'the', 'thirsty', 'entrance', 'of', 'this', 'soil'], ['shall', 'daub', 'her', 'lips', 'with', 'her', 'own', 'childrens', 'blood'], ['nor', 'more', 'shall', 'trenching', 'war', 'channel', 'her', 'fields'], ['nor', 'bruise', 'her', 'flowerets', 'with', 'the', 'armed', 'hoofs'], ['of', 'hostile', 'paces', 'those', 'opposed', 'eyes'], ['which', 'like', 'the', 'meteors', 'of', 'a', 'troubled', 'heaven'], ['all', 'of', 'one', 'nature', 'of', 'one', 'substance', 'bred'], ['did', 'lately', 'meet', 'in', 'the', 'intesti

In [24]:
hal_model = HALModel(window_size=2)
hal_model.train(lines)

In [25]:
wordsim_path = 'combined.csv'
word_sim_data = pd.read_csv(wordsim_path)
word_pairs = [(w1,w2) for w1,w2 in zip(word_sim_data['Word 1'], word_sim_data['Word 2'])]
human_scores = word_sim_data['Human (mean)'].astype(float)

In [27]:
k_fold_cross_validation(hal_model, lines, word_pairs, human_scores, k=10)


Fold 1
Spearman Correlation: 0.0231

Fold 2
Spearman Correlation: 0.0391

Fold 3
Spearman Correlation: 0.0103

Fold 4
Spearman Correlation: 0.0173

Fold 5
Spearman Correlation: 0.0215

Fold 6
Spearman Correlation: 0.0159

Fold 7
Spearman Correlation: 0.0206

Fold 8
Spearman Correlation: 0.0158

Fold 9
Spearman Correlation: 0.0290

Fold 10
Spearman Correlation: 0.0072

Overall Average Spearman Correlation: 0.0200
