Based on the paper:
Paul Hoffman, Matthew A. Lambon Ralph, and Timothy T. Rogers, “Semantic Diversity: A Measure of Semantic Ambiguity Based on Variability in the Contextual Usage of Words,” Behavior Research Methods 45, no. 3 (September 1, 2013): 718–30, https://doi.org/10.3758/s13428-012-0278-x.

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch


In [2]:
def perform_lsa(corpus, context_length=1000, n_components=300):
    """
    Perform latent semantic analysis on the given corpus.

    The corpus is cut into consecutive, non-overlapping contexts of
    ``context_length`` words (the last context may be shorter). A
    context-by-word count matrix is built, log-transformed, divided by each
    word's entropy across contexts, and factorised with a singular value
    decomposition.

    Args:
        corpus (list of str): The corpus as one flat sequence of word tokens.
        context_length (int): The length of each context in words. Default is 1,000.
        n_components (int): Maximum number of LSA dimensions to keep. Default is 300.
            Fewer dimensions are returned when the matrix has fewer singular values.

    Returns:
        word_vectors (dict): A dictionary mapping each distinct word to its LSA vector
            (length ``k = min(n_components, n_contexts, n_words)``).
        context_vectors (numpy.ndarray): Array of shape ``(n_contexts, k)``; row ``i`` is
            the LSA vector of the i-th context. For an empty corpus the shape is ``(0, 0)``.

    Raises:
        ValueError: If ``context_length`` or ``n_components`` is not positive.
    """
    if context_length <= 0:
        raise ValueError("context_length must be a positive integer")
    if n_components <= 0:
        raise ValueError("n_components must be a positive integer")

    corpus = list(corpus)
    if not corpus:
        return {}, np.empty((0, 0))

    # Words are indexed in order of first appearance (dict preserves insertion order).
    index_to_word = list(dict.fromkeys(corpus))
    word_to_index = {word: index for index, word in enumerate(index_to_word)}

    # Count matrix: one row per context, one column per word.
    n_contexts = -(-len(corpus) // context_length)  # ceiling division
    word_ids = np.fromiter(
        (word_to_index[word] for word in corpus), dtype=np.intp, count=len(corpus)
    )
    context_ids = np.arange(len(corpus)) // context_length
    counts = np.zeros((n_contexts, len(index_to_word)))
    # np.add.at accumulates repeated (context, word) pairs; plain fancy-index
    # assignment would count each pair only once.
    np.add.at(counts, (context_ids, word_ids), 1)

    # Entropy of each word's distribution over contexts, with 0 * log(0) taken as 0.
    proportions = counts / counts.sum(axis=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        p_log_p = np.where(proportions > 0, proportions * np.log(proportions), 0.0)
    word_entropies = -p_log_p.sum(axis=0)

    # Log-transform the counts and divide by the word's entropy. A word that
    # occurs in a single context has zero entropy; its column is set to 0
    # instead of dividing by zero.
    weighted = np.log(counts + 1)
    weighted = np.divide(
        weighted, word_entropies, out=np.zeros_like(weighted), where=word_entropies > 0
    )

    # Singular value decomposition. Rows of the matrix are contexts, so the rows
    # of `u` describe contexts and the columns of `vt` describe words.
    u, s, vt = np.linalg.svd(weighted, full_matrices=False)
    k = min(n_components, len(s))  # 300 dimensions is supposedly optimal according to the article.
    word_vectors = {word: vt[:k, i] for i, word in enumerate(index_to_word)}
    context_vectors = u[:, :k]

    return word_vectors, context_vectors

def calculate_semd(word, word_vectors, context_vectors, contexts=None):
    """
    Calculate the semantic diversity (SemD) of the given word.

    SemD is the negative natural log of the mean pairwise cosine similarity
    between the vectors of the contexts considered for the word.

    Args:
        word (str): The word for which to calculate SemD.
        word_vectors (dict): A dictionary mapping words to their LSA vectors. Only
            membership is used here: a word that is not a key yields NaN.
        context_vectors (array-like): One LSA vector per context, shape (n_contexts, k).
        contexts (list of sequences, optional): The tokenised contexts, aligned
            one-to-one with ``context_vectors``. When given, only contexts that
            contain ``word`` are used. When omitted, every context is used,
            which makes the result identical for all known words.

    Returns:
        semd (float): The semantic diversity value for the word. NaN when the word is
            unknown or fewer than two usable contexts remain; NaN or inf when the
            mean similarity is not positive.

    Raises:
        ValueError: If ``contexts`` is given but its length differs from
            ``context_vectors``.
    """
    # NOTE: the original notebook had a disabled step that capped the analysis
    # at 2000 randomly sampled contexts; it is still not applied here.
    if word not in word_vectors:
        return float("nan")

    vectors = np.asarray(context_vectors, dtype=float)
    if contexts is not None:
        if len(contexts) != len(vectors):
            raise ValueError("contexts and context_vectors must have the same length")
        keep = np.fromiter(
            (word in context for context in contexts), dtype=bool, count=len(contexts)
        )
        vectors = vectors[keep]
    if vectors.ndim != 2 or len(vectors) < 2:
        return float("nan")

    # Scale every vector to unit length; all-zero vectors have no direction,
    # so their cosine similarity is undefined and they are skipped.
    norms = np.linalg.norm(vectors, axis=1)
    nonzero = norms > 0
    unit_vectors = vectors[nonzero] / norms[nonzero, np.newaxis]
    n = len(unit_vectors)
    if n < 2:
        return float("nan")

    # For unit vectors, ||sum_i u_i||^2 = n + 2 * sum_{i<j} u_i . u_j, so the mean
    # over the n * (n - 1) / 2 distinct pairs follows without forming all pairs.
    total = unit_vectors.sum(axis=0)
    mean_similarity = (total @ total - n) / (n * (n - 1))

    # Calculate the SemD value
    with np.errstate(divide="ignore", invalid="ignore"):
        semd = -np.log(mean_similarity)

    return float(semd)

In [3]:
# Build the corpus: one flat list of whitespace-separated tokens taken from
# every .txt file in `directory`.
corpus = []

directory = '../Text data/'
# os.listdir returns names in arbitrary order; sorting keeps the token order,
# and therefore the context boundaries derived from it, reproducible.
for file in sorted(os.listdir(directory)):
    if file.endswith('.txt'):
        # NOTE(review): files are decoded with the platform default encoding;
        # pass an explicit `encoding=` once the corpus encoding is confirmed.
        with open(os.path.join(directory, file), 'r') as f:
            # str.split() with no arguments already splits on newlines.
            corpus.extend(f.read().split())


len(corpus)

962134

In [4]:
# Chunk the corpus into consecutive contexts of `context_length` words each;
# the final context holds whatever is left over and may be shorter.
context_length = 1000
contexts = [
    corpus[start:start + context_length]
    for start in range(0, len(corpus), context_length)
]

# `current_context` ends up as the trailing partial context, or as an empty
# list when the corpus length is an exact multiple of `context_length`.
current_context = contexts[-1] if len(corpus) % context_length else []

In [5]:
# Pre-trained 'bert-base-uncased' checkpoint: the tokenizer turns text into
# model inputs, the model produces the embeddings used further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [6]:
def get_context_embedding(texts):
    """
    Embed each text with the module-level BERT `model` and `tokenizer`.

    Args:
        texts (iterable): The texts to embed. Each item is either a string or a
            sequence of word strings; word sequences are joined with single
            spaces so that the whole context is encoded as ONE text.

    Returns:
        embeddings (torch.Tensor): The model's `pooler_output` for every text,
            stacked along dim 0, i.e. shape (len(texts), 1, hidden_size).
            An empty 1-D tensor is returned when `texts` is empty.
    """
    pooled_outputs = []

    # Inference only: without no_grad every forward pass would keep its
    # autograd graph alive and memory use would grow needlessly.
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                # A list of strings is treated by the tokenizer as a batch of
                # separate sequences (one per word), not as a single context.
                text = ' '.join(text)

            # Tokenize the input text. `truncation=True` cuts the text at the
            # model's maximum input length, so long contexts are only partly
            # embedded (NOTE(review): confirm this is acceptable).
            encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

            # Pass the input through the BERT model
            output = model(**encoded_input)

            # Extract the sentence embedding from the output
            pooled_outputs.append(output.pooler_output.unsqueeze(0))

    if not pooled_outputs:
        return torch.tensor([])

    # Concatenate once at the end instead of re-copying the result every iteration.
    return torch.cat(pooled_outputs, dim=0)


In [7]:
# Get the BERT embeddings for all contexts, one batch at a time. The results
# are only accumulated in memory here; nothing is written to disk in this cell.
batch_size = 1
n_batches = -(-len(contexts) // batch_size)  # ceiling division
batch_embeddings = []
for batch_number, start in enumerate(range(0, len(contexts), batch_size), start=1):
    batch_contexts = contexts[start:start + batch_size]
    batch_embeddings.append(get_context_embedding(batch_contexts))
    # Report batches, not item offsets, so the message stays correct for any batch_size.
    print(f'Batch {batch_number} out of {n_batches} completed')

# Concatenate once at the end rather than re-copying the growing tensor after every batch.
all_context_embeddings = torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.tensor([])



Batch 1 out of 963 completed
Batch 2 out of 963 completed
Batch 3 out of 963 completed
Batch 4 out of 963 completed
Batch 5 out of 963 completed
Batch 6 out of 963 completed
Batch 7 out of 963 completed
Batch 8 out of 963 completed
Batch 9 out of 963 completed
Batch 10 out of 963 completed
Batch 11 out of 963 completed
Batch 12 out of 963 completed
Batch 13 out of 963 completed
Batch 14 out of 963 completed
Batch 15 out of 963 completed
Batch 16 out of 963 completed
Batch 17 out of 963 completed
Batch 18 out of 963 completed
Batch 19 out of 963 completed
Batch 20 out of 963 completed
Batch 21 out of 963 completed
Batch 22 out of 963 completed
Batch 23 out of 963 completed
Batch 24 out of 963 completed
Batch 25 out of 963 completed
Batch 26 out of 963 completed
Batch 27 out of 963 completed
Batch 28 out of 963 completed
Batch 29 out of 963 completed
Batch 30 out of 963 completed
Batch 31 out of 963 completed
Batch 32 out of 963 completed
Batch 33 out of 963 completed
Batch 34 out of 963

In [1]:

# Look up the shape of the accumulated embeddings tensor. This requires that
# `all_context_embeddings` was created earlier in the same session.
all_context_embeddings.shape

# Save all context embeddings to 'all_context_embeddings.pt' (path relative to
# the current working directory) so they do not have to be recomputed.
torch.save(all_context_embeddings, 'all_context_embeddings.pt')

NameError: name 'all_context_embeddings' is not defined

In [None]:
# Reset the scratch variables used by the disabled (commented-out) draft of
# the co-occurrence / LSA computation that follows in this cell.
current_context = []  # words collected for the context being built
word_to_index = {}  # word -> column index
index_to_word = []  # column index -> word
co_occurrence_matrix = []  # one count vector per context
# co_occurrence_matrix = np.zeros((len(set(corpus)), len(set(corpus))))




# for word in corpus:
#     current_context.append(word)
#     if len(current_context) == context_length:
#         context_vector = [0] * len(set(corpus))
#         for w in current_context:
#             if w not in word_to_index:
#                 word_to_index[w] = len(index_to_word)
#                 index_to_word.append(w)
#             word_index = word_to_index[w]
#             context_vector[word_index] += 1
#         contexts.append(current_context)
#         co_occurrence_matrix.append(context_vector)
#         current_context = []
# if current_context:
#         context_vector = [0] * len(set(corpus))
#         for w in current_context:
#             if w not in word_to_index:
#                 word_to_index[w] = len(index_to_word)
#                 index_to_word.append(w)
#             word_index = word_to_index[w]
#             context_vector[word_index] += 1
#         co_occurrence_matrix.append(context_vector)
# co_occurrence_matrix = np.array(co_occurrence_matrix)
# co_occurrence_matrix[0]
# # Apply log transformation and entropy weighting
# co_occurrence_matrix = np.log(co_occurrence_matrix + 1)
# word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
# co_occurrence_matrix = np.divide(co_occurrence_matrix, word_entropies)
# co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)

# for context in co_occurrence_matrix:
    
#     # Perform singular value decomposition
#     u[context], s[context], vt[context] = np.linalg.svd(co_occurrence_matrix, full_matrices=False)

#     word_vectors = {word: u[i, :300] for i, word in enumerate(index_to_word)} # 300 dimensions is supposedly optimal according to the article.


In [None]:
# Perform latent semantic analysis (LSA).
# perform_lsa receives the flat word list `corpus` and cuts it into contexts of
# `context_length` words itself; it returns word vectors and context vectors.
word_vectors, context_vectors = perform_lsa(corpus, context_length=1000)
# Bare expression: shows the resulting dictionary as the cell output.
word_vectors

In [None]:
# Step-by-step SemD computation for a single word, keeping the intermediate
# values around for inspection.
word = 'man'
# NOTE(review): this keeps every context vector whenever the word is in the
# vocabulary; it does not restrict the selection to contexts containing the word.
word_contexts = np.asarray(context_vectors, dtype=float) if word in word_vectors else np.empty((0, 0))
# if len(word_contexts) > 2000:
#     word_contexts = np.random.choice(word_contexts, size=2000, replace=False)

if len(word_contexts) < 2:
    # No pair of contexts to compare, so the similarity is undefined.
    context_similarities = np.array([])
    mean_similarity = semd = float('nan')
else:
    # Calculate the cosine similarity of every pair (i < j) of contexts using
    # numpy only; the `cosine` helper used previously was never imported.
    unit_contexts = word_contexts / np.linalg.norm(word_contexts, axis=1, keepdims=True)
    rows, cols = np.triu_indices(len(unit_contexts), k=1)
    context_similarities = (unit_contexts @ unit_contexts.T)[rows, cols]
    mean_similarity = np.mean(context_similarities)

    # Calculate the SemD value
    semd = -np.log(mean_similarity)
context_similarities

In [None]:

# Calculate semantic diversity (SemD) for each word.
# Iterating a set of strings has no stable order between runs, so the words are
# sorted and kept in `semd_words`: semd_values[i] belongs to semd_words[i].
semd_words = sorted(set(corpus))
semd_values = [
    calculate_semd(word, word_vectors, context_vectors) for word in semd_words
]


In [None]:
# Number of SemD values computed (one per distinct word of the corpus).
len(semd_values)

In [None]:
# Disabled: would replace NaN SemD values with 0 before inspection.
# semd_values = [0 if np.isnan(x) else x for x in semd_values]
# Bare expression: shows the list of SemD values as the cell output.
semd_values

In [None]:

# Plot the distribution of words as a function of semantic diversity.
# SemD can be NaN or infinite for some words; such values cannot be placed in
# a bin and can make the histogram range invalid, so only finite values are plotted.
finite_semd_values = [value for value in semd_values if np.isfinite(value)]
plt.figure(figsize=(10, 6))
plt.hist(finite_semd_values, bins=30)
plt.xlabel('Semantic Diversity (SemD)')
plt.ylabel('Number of Words')
plt.title('Distribution of Words by Semantic Diversity')
plt.show()