Based on the paper:
Paul Hoffman, Matthew A. Lambon Ralph, and Timothy T. Rogers, “Semantic Diversity: A Measure of Semantic Ambiguity Based on Variability in the Contextual Usage of Words,” Behavior Research Methods 45, no. 3 (September 1, 2013): 718–30, https://doi.org/10.3758/s13428-012-0278-x.

This notebook is to recreate SemD calculations as per the specifications in the above paper. It doesn't use bert.

In [1]:
# %pip install -r requirements.txt

In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch
from collections import defaultdict
import re
from scipy.spatial import distance


# Corpus

## loading corpus

In [3]:
corpus = []

directory = '../Text data/'
for file in os.listdir(directory):
    if file.endswith('.txt'):
        with open(os.path.join(directory, file), 'r') as f:
            content = f.read().replace('\n', ' ')
            # content = tokenizer.tokenize(content) # Bert Tokenizer  will not preserve the original words
            content = word_tokenize(content) # NLTK Tokenizer will preserve the original words, but I'm not sure if they can be used with BERT
            corpus.extend(content)
# delete tokens that are not words
corpus = [word for word in corpus if re.match(r'^[a-zA-Z]+$', word)]

print(f"Length of Corpus: {len(corpus)}")
corpus[:20]

Length of Corpus: 953359


['A',
 'Midsummer',
 'Night',
 'Dream',
 'by',
 'William',
 'Shakespeare',
 'Edited',
 'by',
 'Barbara',
 'Mowat',
 'and',
 'Paul',
 'Werstine',
 'with',
 'Michael',
 'Poston',
 'and',
 'Rebecca',
 'Niles']

## context splitting

In [4]:
# Split corpus into contexts of length context_length
context_length = 100
current_context = []
contexts = []

for word in corpus:
    current_context.append(word)
    if len(current_context) == context_length:
        contexts.append(current_context)
        current_context = []

if current_context:
    contexts.append(current_context)

# Definitions

## LSA def:  perform_lsa(corpus)

In [9]:
def perform_lsa(corpus):
    """
    Perform latent semantic analysis on the given corpus.
    
    Args:
        corpus (list of lists): The corpus of text, where each inner list represents a context.
        context_length (int): The length of each context in words. Default is 1,000.
    
    Returns:
        word_vectors (dict): A dictionary mapping words to their 300-dimensional LSA vectors.
        context_vectors (list): A list of 300-dimensional LSA vectors, one for each context.
    """
    co_occurrence_matrix, word_to_index, index_to_word = create_co_occurrence_matrix(corpus)


    print(f'Co occurance matrix shape: {co_occurrence_matrix.shape}')
    print(f'Word to index mapping length: {len(word_to_index)}')
    print(f'Index to word mapping length: {len(index_to_word)}')

    # Apply log transformation 
    co_occurrence_matrix = np.log(co_occurrence_matrix + 1)

    # get word entropy of each word in the matrix
    word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix + 1e-10), axis=0) # Add a small value to avoid log(0)

    # Apply entropy weighting
    co_occurrence_matrix = np.divide(co_occurrence_matrix, word_entropies + 1e-10) # Add a small value to avoid division by zero
    co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)

    # Perform singular value decomposition
    u, s, vt = np.linalg.svd(co_occurrence_matrix, full_matrices=False)
    print(f'U shape: {u.shape}')
    print(f'Vt shape: {vt.shape}')
    print(f'S shape: {s.shape}')

    # word_vectors = {word: u[i, :300] for i, word in enumerate(index_to_word)} # 300 dimensions is supposedly optimal according to the article.
    word_vectors = {word: u[i, :] for i, word in enumerate(index_to_word)} # I'm doing all values unlike the article, because I'm not sure if the 300 dimensions are optimal.
    # context_vectors = vt[:300, :]
    context_vectors = vt[:, :]

    print(f'Word vectors length: {len(word_vectors)}')
    print(f'Context vectors length: {len(context_vectors)}')
    
    return word_vectors, context_vectors


## SemD def:  calculate_semd(word, word_vectors, context_vectors)

In [10]:

def calculate_semd(word, word_vectors, context_vectors):
    """
    Calculate the semantic diversity (SemD) of the given word.
    
    Args:
        word (str): The word for which to calculate SemD.
        word_vectors (dict): A dictionary mapping words to their 300-dimensional LSA vectors.
        context_vectors (list): A list of 300-dimensional LSA vectors, one for each context.
    
    Returns:
        semd (float): The semantic diversity value for the word.
    """
    # Find all contexts containing the word
    word_contexts = [i for i in context_vectors if word in word_vectors]
    # if len(word_contexts) > 2000:
    #     word_contexts = np.random.choice(word_contexts, size=2000, replace=False)

    # Calculate the average cosine similarity between the contexts
    context_similarities = [1 - distance.cosine(word_contexts[i], word_contexts[j]) for i in range(len(word_contexts)) for j in range(len(word_contexts)) if i < j]
    mean_similarity = np.mean(context_similarities)

    # Calculate the SemD value
    semd = -np.log(mean_similarity)
    
    return semd

## Co-occurence matrix def: create_co_occurrence_matrix(corpus)

In [11]:
def create_co_occurrence_matrix(corpus, window_size=len(corpus)):
    distinct_words = sorted(list(set(corpus)))
    num_words = len(distinct_words)
    
    print(f"Number of distinct words: {num_words}")
    print(f"Number of words in corpus: {len(corpus)}")

    # Create word to index and index to word mappings
    word_to_index = {word: index for index, word in enumerate(distinct_words)}
    index_to_word = {index: word for index, word in enumerate(distinct_words)}
    
    # Create an empty co-occurrence matrix
    co_occurrence_matrix = np.zeros((num_words, num_words))
    # only words with minimum 10 appearances in corpus are inculeded in the co-occurrence matrix
    word_counts = defaultdict(int)
    for word in corpus:
        word_counts[word] += 1
    for i, word in enumerate(distinct_words):
        if word_counts[word] >= 5:
            word_to_index[word] = i 
            index_to_word[i] = word
        else:
            del word_to_index[word]
            del index_to_word[i]

    # Rearrange the word to index and index to word mappings
    word_to_index = {word: index for index, word in enumerate(word_to_index.keys())}
    index_to_word = {index: word for word, index in word_to_index.items()}

    # delete the words that are not in the word_to_index mapping
    corpus = [word for word in corpus if word in word_to_index]

    num_words = len(word_to_index)
    print(f"Number of distinct words with at least 5 appearances: {num_words}")

    # Print new corpus length
    print(f"Number of words in corpus after removing words with less than 5 appearances: {len(corpus)}")
    
    # Iterate over the corpus and update the co-occurrence matrix
    iteration_count = 0
    total_iterations = len(corpus) * (2 * window_size - 1)
    print(f"Total iterations: {total_iterations}")
    percent_count = 0

    for i in range(len(corpus)):
        for j in range(max(0, i - window_size), min(i + window_size, len(corpus) - 1) + 1):
            if i != j:
                co_occurrence_matrix[word_to_index[corpus[i]]][word_to_index[corpus[j]]] += 1
                
                iteration_count += 1
                percent_complete = (iteration_count / total_iterations) * 100
                
                if percent_complete >= percent_count:
                    print(f"Iteration: {iteration_count}, Percent Complete: {percent_complete}%")
                    percent_count += 1
                iteration_count += 1
                
    return co_occurrence_matrix, word_to_index, index_to_word

# Example usage:
samplecorpus = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
co_occurrence_matrix, word_to_index, index_to_word = create_co_occurrence_matrix(samplecorpus)
print(co_occurrence_matrix)
print(word_to_index)
print(index_to_word)

Number of distinct words: 8
Number of words in corpus: 9
Number of distinct words with at least 1 appearances: 8
[[0. 1. 1. 1. 1. 1. 1. 2.]
 [1. 0. 1. 1. 1. 1. 1. 2.]
 [1. 1. 0. 1. 1. 1. 1. 2.]
 [1. 1. 1. 0. 1. 1. 1. 2.]
 [1. 1. 1. 1. 0. 1. 1. 2.]
 [1. 1. 1. 1. 1. 0. 1. 2.]
 [1. 1. 1. 1. 1. 1. 0. 2.]
 [2. 2. 2. 2. 2. 2. 2. 2.]]
{'brown': 0, 'dog': 1, 'fox': 2, 'jumps': 3, 'lazy': 4, 'over': 5, 'quick': 6, 'the': 7}
{0: 'brown', 1: 'dog', 2: 'fox', 3: 'jumps', 4: 'lazy', 5: 'over', 6: 'quick', 7: 'the'}


# Generating LSA
and getting word vectors context vectors

In [19]:
# Performing LSA for the whole corpus
context_vectors, word_vectors = perform_lsa(corpus)
len(word_vectors), len(context_vectors)

Number of distinct words: 29634
Number of words in corpus: 953359
Number of distinct words with at least 1 appearances: 29634


KeyboardInterrupt: 

In [None]:
# Save the context vector and word vector to a file
np.save('context_vectors_SemD_Basic.npy', context_vectors)
np.save('word_vectors_SemD_Basic.npy', word_vectors)

In [51]:

word_to_index['the']
# word_vectors[word_to_index['the']]

7

In [21]:
# Saving the co-occurence matrix to a file
# np.save('co_occurrence_matrix.npy', co_occurrence_matrix)

In [None]:
# # Create the co-occurrence matrix
# word_to_index = {}
# index_to_word = []
# co_occurrence_matrix = []
# current_context = []


# # Generating co-occurrence matrix for each context in contexts
# for context in contexts[:1]:
#     context_vector = [0] * len(set(corpus))
#     for w in context:
#         if w not in word_to_index:
#             word_to_index[w] = len(index_to_word)
#             index_to_word.append(w)
#         word_index = word_to_index[w]
#         context_vector[word_index] += 1
#     co_occurrence_matrix.append(context_vector)
# co_occurrence_matrix = np.array(co_occurrence_matrix)
# print(f'Co occurance matrix shape: {co_occurrence_matrix.shape}')
# # Apply log transformation and entropy weighting
# co_occurrence_matrix = np.log(co_occurrence_matrix + 1)
# word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
# co_occurrence_matrix = np.divide(co_occurrence_matrix, word_entropies)
# co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)


Co occurance matrix shape: (1, 29634)


  word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
  word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)


In [None]:
# # Perform singular value decomposition
# u, s, vt = np.linalg.svd(co_occurrence_matrix, full_matrices=False)
# print(f'U shape: {u.shape}')
# print(f'Vt shape: {vt.shape}')
# print(f'S shape: {s.shape}')
# word_vectors = {word: u[i, :300] for i, word in enumerate(index_to_word)} # 300 dimensions is supposedly optimal according to the article.
# context_vectors = vt[:300, :]
# word_vectors = {index_to_word[i]: u[i] for i in range(len(index_to_word))}
# len(context_vectors)

U shape: (806, 806)
Vt shape: (806, 806)
S shape: (806,)


300

In [None]:
# # Perform latent semantic analysis (LSA) on the contexts
# # This will give us word vectors and context vectors
# word_vectors, context_vectors = perform_lsa(corpus, context_length=1000)
# word_vectors
word_to_index['the']

1369

In [195]:
# calculate the semantic diversity of all words in the index of words
semd_values = {}
for word in word_to_index.keys():
    semd_values[word] = calculate_semd(word, word_vectors, context_vectors)
    print(f"Word: {word}, SemD: {semd_values[word]}")


Word: A, SemD: 40.19228816291038
Word: ACT, SemD: 40.19228816291038
Word: ALL, SemD: 40.19228816291038
Word: Adieu, SemD: 40.19228816291038
Word: Aegles, SemD: 40.19228816291038
Word: Against, SemD: 40.19228816291038
Word: Ah, SemD: 40.19228816291038
Word: All, SemD: 40.19228816291038
Word: Am, SemD: 40.19228816291038
Word: Amazon, SemD: 40.19228816291038
Word: Amazons, SemD: 40.19228816291038
Word: An, SemD: 40.19228816291038
Word: And, SemD: 40.19228816291038
Word: Another, SemD: 40.19228816291038
Word: Answer, SemD: 40.19228816291038
Word: Antiopa, SemD: 40.19228816291038
Word: Apollo, SemD: 40.19228816291038
Word: Are, SemD: 40.19228816291038
Word: Ariadne, SemD: 40.19228816291038
Word: As, SemD: 40.19228816291038
Word: At, SemD: 40.19228816291038
Word: Athenian, SemD: 40.19228816291038
Word: Athens, SemD: 40.19228816291038
Word: Attendants, SemD: 40.19228816291038
Word: Awake, SemD: 40.19228816291038
Word: Ay, SemD: 40.19228816291038
Word: BOTTOM, SemD: 40.19228816291038
Word: Bar

KeyboardInterrupt: 

In [None]:

# Calculate semantic diversity (SemD) for each word
semd_values = []

for word in set(corpus):
    semd_value = calculate_semd(word, word_vectors, context_vectors)
    semd_values.append(semd_value)


In [None]:
len(semd_values)

In [None]:
# semd_values = [0 if np.isnan(x) else x for x in semd_values]
semd_values

In [None]:

# Plot the distribution of words as a function of semantic diversity
plt.figure(figsize=(10, 6))
plt.hist(semd_values, bins=30)
plt.xlabel('Semantic Diversity (SemD)')
plt.ylabel('Number of Words')
plt.title('Distribution of Words by Semantic Diversity')
plt.show()