In [1]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [10]:
class LSAModel:
    """Latent Semantic Analysis over a bag-of-words count matrix.

    Documents are turned into token counts with ``CountVectorizer`` and then
    projected onto ``num_topics`` components with ``TruncatedSVD``. Two texts
    are compared by the cosine similarity of their projections.
    """

    def __init__(self, num_topics=100, random_state=42):
        """Create the (still unfitted) vectorizer and SVD.

        Args:
            num_topics: Number of SVD components to keep.
            random_state: Seed forwarded to ``TruncatedSVD`` so that repeated
                runs on the same data give the same projection. Pass ``None``
                to get the previous unseeded behaviour.
        """
        self.num_topics = num_topics
        self.random_state = random_state
        self.vectorizer = CountVectorizer()
        self.svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
        # Remains None until train() has run; similarity() uses it as the
        # "has this model been fitted?" flag.
        self.doc_topic_matrix = None

    @staticmethod
    def _as_text(doc):
        """Return ``doc`` as a single string.

        A ``str`` is returned unchanged; any other iterable of tokens is
        joined with single spaces. Joining a ``str`` directly would insert a
        space between every character, which is why strings are special-cased.
        """
        if isinstance(doc, str):
            return doc
        return ' '.join(doc)

    def train(self, documents):
        """Fit the vectorizer and the SVD on ``documents``.

        Args:
            documents: Iterable of documents. Each document may be either a
                sequence of string tokens or an already-joined string.

        The fitted projection of the training documents is stored in
        ``self.doc_topic_matrix``.
        """
        texts = [self._as_text(doc) for doc in documents]

        # One row per document, one column per vocabulary term.
        doc_term_matrix = self.vectorizer.fit_transform(texts)

        # Reduce the count matrix to num_topics dimensions.
        self.doc_topic_matrix = self.svd.fit_transform(doc_term_matrix)

    def similarity(self, doc1, doc2):
        """Return the cosine similarity of two texts in the fitted topic space.

        Args:
            doc1: First text, as a string or a sequence of string tokens.
            doc2: Second text, in the same forms as ``doc1``.

        Returns:
            The cosine similarity between the two projected vectors.

        Raises:
            ValueError: If ``train()`` has not been called yet.
        """
        if self.doc_topic_matrix is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Vectorize and project both texts in one batch instead of making
        # two separate vectorizer/SVD calls.
        counts = self.vectorizer.transform(
            [self._as_text(doc1), self._as_text(doc2)]
        )
        topic_vectors = self.svd.transform(counts)

        # Slices keep each operand 2-D, as cosine_similarity expects.
        return cosine_similarity(topic_vectors[:1], topic_vectors[1:])[0][0]

In [11]:
def evaluate_similarity(model, documents, word_pairs, human_scores):
    """Correlate the model's pairwise similarities with human judgements.

    Args:
        model: Object exposing ``similarity(a, b)``.
        documents: Accepted for interface compatibility; not used here.
        word_pairs: Iterable of 2-tuples passed to ``model.similarity``.
        human_scores: Reference scores, aligned with ``word_pairs``.

    Returns:
        The Spearman rank correlation between the model's scores and
        ``human_scores``.
    """
    model_scores = []
    for first, second in word_pairs:
        model_scores.append(model.similarity(first, second))

    rho, _p_value = spearmanr(model_scores, human_scores)
    return rho

In [12]:
def k_fold_cross_validation(model, documents, word_pairs, human_scores, k=5):
    """Retrain ``model`` on ``k`` training folds and score each fit.

    ``documents`` is split with a shuffled, fixed-seed ``KFold``. For every
    fold the model is retrained on the training part and then scored with
    ``evaluate_similarity`` against the same ``word_pairs`` / ``human_scores``.

    Note: the validation documents are handed to ``evaluate_similarity``, but
    as that function is defined in this notebook it ignores them, so the
    per-fold score depends only on which documents were used for training.

    Args:
        model: Object exposing ``train(documents)`` and ``similarity(a, b)``.
            It is refitted in place on every fold.
        documents: Indexable collection of training documents.
        word_pairs: Pairs passed on to ``evaluate_similarity``.
        human_scores: Reference scores aligned with ``word_pairs``.
        k: Number of folds.

    Returns:
        The mean of the per-fold Spearman correlations (the same value that
        is printed as the overall average).
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    spearman_correlations = []

    for fold_number, (train_indices, val_indices) in enumerate(kf.split(documents), start=1):
        print(f"\nFold {fold_number}")

        # Materialise this fold's training and validation documents.
        train_documents = [documents[i] for i in train_indices]
        val_documents = [documents[i] for i in val_indices]

        # Refit the model on the training part only.
        model.train(train_documents)

        spearman_corr = evaluate_similarity(model, val_documents, word_pairs, human_scores)
        print(f"Spearman Correlation: {spearman_corr:.4f}")

        spearman_correlations.append(spearman_corr)

    overall_avg_corr = float(np.mean(spearman_correlations))
    print(f"\nOverall Average Spearman Correlation: {overall_avg_corr:.4f}")

    # Hand the headline number back so callers can store or compare it
    # instead of having to parse the printed output.
    return overall_avg_corr

In [13]:
import string

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Build the punctuation-stripping table once; it is the same for every line.
punctuation_table = str.maketrans('', '', string.punctuation)

# Read the corpus with an explicit encoding so decoding does not depend on
# the machine's locale, and clean each line in a single pass:
# strip trailing whitespace/newline -> lowercase -> drop punctuation -> tokenize.
# Blank lines are kept (as empty token lists) so the number of documents
# equals the number of lines in the file.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    lines = [
        word_tokenize(line.rstrip().lower().translate(punctuation_table))
        for line in f
    ]
print(lines[:20])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['act', 'i'], ['scene', 'i', 'london', 'the', 'palace'], ['enter', 'king', 'henry', 'lord', 'john', 'of', 'lancaster', 'the', 'earl', 'of', 'westmoreland', 'sir', 'walter', 'blunt', 'and', 'others'], ['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care'], ['find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant'], ['and', 'breathe', 'shortwinded', 'accents', 'of', 'new', 'broils'], ['to', 'be', 'commenced', 'in', 'strands', 'afar', 'remote'], ['no', 'more', 'the', 'thirsty', 'entrance', 'of', 'this', 'soil'], ['shall', 'daub', 'her', 'lips', 'with', 'her', 'own', 'childrens', 'blood'], ['nor', 'more', 'shall', 'trenching', 'war', 'channel', 'her', 'fields'], ['nor', 'bruise', 'her', 'flowerets', 'with', 'the', 'armed', 'hoofs'], ['of', 'hostile', 'paces', 'those', 'opposed', 'eyes'], ['which', 'like', 'the', 'meteors', 'of', 'a', 'troubled', 'heaven'], ['all', 'of', 'one', 'nature', 'of', 'one', 'substance', 'bred'], ['did', 'lately', 'meet', 'in', 'the', 'intesti

In [14]:
# Load the word-similarity benchmark: one row per word pair, with the mean
# human rating in the 'Human (mean)' column.
wordsim_path = 'combined.csv'
word_sim_data = pd.read_csv(wordsim_path)

# (word1, word2) tuples, in file order.
word_pairs = list(zip(word_sim_data['Word 1'], word_sim_data['Word 2']))

# Human ratings as floats, aligned row-for-row with word_pairs.
human_scores = word_sim_data['Human (mean)'].astype(float)

In [16]:
# Fit a 100-topic LSA model under 10-fold cross-validation over the corpus
# lines and report its Spearman correlation with the human ratings.
lsa_model = LSAModel(num_topics=100)
k_fold_cross_validation(
    model=lsa_model,
    documents=lines,
    word_pairs=word_pairs,
    human_scores=human_scores,
    k=10,
)


Fold 1
Spearman Correlation: 0.0142

Fold 2
Spearman Correlation: -0.0426

Fold 3
Spearman Correlation: 0.0090

Fold 4
Spearman Correlation: 0.0380

Fold 5
Spearman Correlation: 0.0119

Fold 6
Spearman Correlation: 0.0103

Fold 7
Spearman Correlation: -0.0127

Fold 8
Spearman Correlation: 0.0027

Fold 9
Spearman Correlation: -0.0076

Fold 10
Spearman Correlation: 0.0069

Overall Average Spearman Correlation: 0.0030
