Based on the paper:
Paul Hoffman, Matthew A. Lambon Ralph, and Timothy T. Rogers, “Semantic Diversity: A Measure of Semantic Ambiguity Based on Variability in the Contextual Usage of Words,” Behavior Research Methods 45, no. 3 (September 1, 2013): 718–30, https://doi.org/10.3758/s13428-012-0278-x.

In [141]:
# !pip install -r requirements.txt

In [142]:
import numpy as np
import os
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch
import re


In [143]:
def perform_lsa(corpus, context_length=1000):
    """
    Perform latent semantic analysis on the given corpus.
    
    Args:
        corpus (list of lists): The corpus of text, where each inner list represents a context.
        context_length (int): The length of each context in words. Default is 1,000.
    
    Returns:
        word_vectors (dict): A dictionary mapping words to their 300-dimensional LSA vectors.
        context_vectors (list): A list of 300-dimensional LSA vectors, one for each context.
    """
    # Create the co-occurrence matrix
    word_to_index = {}
    index_to_word = []
    co_occurrence_matrix = []
    current_context = []
    for word in corpus:
        current_context.append(word)
        if len(current_context) == context_length:
            context_vector = [0] * len(set(corpus))
            for w in current_context:
                if w not in word_to_index:
                    word_to_index[w] = len(index_to_word)
                    index_to_word.append(w)
                word_index = word_to_index[w]
                context_vector[word_index] += 1
            co_occurrence_matrix.append(context_vector)
            current_context = []
    if current_context:
        context_vector = [0] * len(set(corpus))
        for w in current_context:
            if w not in word_to_index:
                word_to_index[w] = len(index_to_word)
                index_to_word.append(w)
            word_index = word_to_index[w]
            context_vector[word_index] += 1
        co_occurrence_matrix.append(context_vector)
    co_occurrence_matrix = np.array(co_occurrence_matrix)
    
    # Apply log transformation and entropy weighting
    co_occurrence_matrix = np.log(co_occurrence_matrix + 1)
    word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
    co_occurrence_matrix = np.divide(co_occurrence_matrix, word_entropies)
    co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)
    
    # Perform singular value decomposition
    u, s, vt = np.linalg.svd(co_occurrence_matrix, full_matrices=False)
    word_vectors = {word: u[i, :300] for i, word in enumerate(index_to_word)} # 300 dimensions is supposedly optimal according to the article.
    context_vectors = vt[:300, :]
    
    return word_vectors, context_vectors

def calculate_semd(word, word_vectors, context_vectors):
    """
    Calculate the semantic diversity (SemD) of the given word.
    
    Args:
        word (str): The word for which to calculate SemD.
        word_vectors (dict): A dictionary mapping words to their 300-dimensional LSA vectors.
        context_vectors (list): A list of 300-dimensional LSA vectors, one for each context.
    
    Returns:
        semd (float): The semantic diversity value for the word.
    """
    # Find all contexts containing the word
    word_contexts = [i for i in context_vectors if word in word_vectors]
    # if len(word_contexts) > 2000:
    #     word_contexts = np.random.choice(word_contexts, size=2000, replace=False)

    # Calculate the average cosine similarity between the contexts
    context_similarities = [1 - cosine(word_contexts[i], word_contexts[j]) for i in range(len(word_contexts)) for j in range(len(word_contexts)) if i < j]
    mean_similarity = np.mean(context_similarities)

    # Calculate the SemD value
    semd = -np.log(mean_similarity)
    
    return semd

def prepare_text(text):
    # text = "[CLS] " + text + " [SEP]"
    
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = torch.ones((1, len(indexed_tokens)), dtype=torch.long)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = segments_ids.clone()
    return tokenized_text, tokens_tensor, segments_tensor

In [144]:
corpus = []

directory = '../Text data/'
for file in os.listdir(directory):
    if file.endswith('.txt'):
        with open(os.path.join(directory, file), 'r') as f:
            content = f.read().replace('\n', ' ')
            # Remove website links from the corpus
            corpus_without_links = [re.sub(r'http[s]?://\S+', '', text) for text in corpus]
            # content = tokenizer.tokenize(content) # Bert Tokenizer  will not preserve the original words
            content = word_tokenize(content) # NLTK Tokenizer will preserve the original words, but I'm not sure if they can be used with BERT
            corpus.extend(content)


print(f"Length of Corpus: {len(corpus)}")
corpus[:20]

Length of Corpus: 1190921


['A',
 'Midsummer',
 'Night',
 "'s",
 'Dream',
 'by',
 'William',
 'Shakespeare',
 'Edited',
 'by',
 'Barbara',
 'A.',
 'Mowat',
 'and',
 'Paul',
 'Werstine',
 'with',
 'Michael',
 'Poston',
 'and']

In [145]:
# Split corpus into contexts of length context_length
context_length = 100
current_context = []
contexts = []

for word in corpus:
    current_context.append(word)
    if len(current_context) == context_length:
        contexts.append(current_context)
        current_context = []

if current_context:
    contexts.append(current_context)

In [181]:
import numpy as np
from collections import defaultdict

def create_co_occurrence_matrix(corpus, window_size=1):
    distinct_words = sorted(list(set(corpus)))
    num_words = len(distinct_words)
    
    # Create word to index and index to word mappings
    word_to_index = {word: index for index, word in enumerate(distinct_words)}
    index_to_word = {index: word for index, word in enumerate(distinct_words)}
    
    # Create an empty co-occurrence matrix
    co_occurrence_matrix = np.zeros((num_words, num_words))
    
    # Iterate over the corpus and update the co-occurrence matrix
    for i in range(len(corpus)):
        for j in range(max(0, i - window_size), min(i + window_size, len(corpus) - 1) + 1):
            if i != j:
                co_occurrence_matrix[word_to_index[corpus[i]]][word_to_index[corpus[j]]] += 1
                
    return co_occurrence_matrix, word_to_index, index_to_word

# Example usage:
corpus = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
co_occurrence_matrix, word_to_index, index_to_word = create_co_occurrence_matrix(corpus, window_size=2)

In [133]:
# Load the pre-trained BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [104]:
def get_embedding(contexts):


    all_context_embeddings = torch.tensor([])
    all_word_embeddings = torch.tensor([])

    for context in contexts:
        # Tokenize the input text
        encoded_input = tokenizer(context, return_tensors='pt', padding=True, truncation=True)
        
        # Pass the input through the BERT model
        with torch.no_grad():
            output = model(**encoded_input)
        
        # Extract the context embedding from the output
        context_embedding = output.pooler_output
        word_embeddings = output.last_hidden_state

        print(word_embeddings.shape)

        all_context_embeddings = torch.cat((all_context_embeddings, context_embedding.unsqueeze(0).detach()), dim=0)
        all_word_embeddings = torch.cat((all_word_embeddings, word_embeddings.detach()), dim=0)

        print(f'Context {i + 1} out of {len(contexts[:10])} completed')
        
    # removing embeddings for the tokens [CLS] and [SEP]
    all_word_embeddings = all_word_embeddings[:, 1:-1, :]

    
    return all_context_embeddings, all_word_embeddings


In [109]:

# Example list of contexts
sample_contexts = contexts[:1]

all_context_embeddings = torch.tensor([])
all_word_embeddings = torch.tensor([])

all_context_embeddings, all_word_embeddings = get_embedding(sample_contexts)

all_context_embeddings.shape, all_word_embeddings.shape


torch.Size([100, 28, 768])
Context 2 out of 1 completed


(torch.Size([1, 100, 768]), torch.Size([100, 26, 768]))

In [180]:
# Create the co-occurrence matrix
word_to_index = {}
index_to_word = []
co_occurrence_matrix = []
current_context = []
# for word in corpus:
#     current_context.append(word)
#     if len(current_context) == context_length:
#         context_vector = [0] * len(set(corpus))
#         for w in current_context:
#             if w not in word_to_index:
#                 word_to_index[w] = len(index_to_word)
#                 index_to_word.append(w)
#             word_index = word_to_index[w]
#             context_vector[word_index] += 1
#         co_occurrence_matrix.append(context_vector)
#         current_context = []
# if current_context:
#     context_vector = [0] * len(set(corpus))
#     for w in current_context:
#         if w not in word_to_index:
#             word_to_index[w] = len(index_to_word)
#             index_to_word.append(w)
#         word_index = word_to_index[w]
#         context_vector[word_index] += 1
#     co_occurrence_matrix.append(context_vector)
# co_occurrence_matrix = np.array(co_occurrence_matrix)

# Generating co-occurrence matrix for each context in contexts
for context in contexts[:1]:
    context_vector = [0] * len(set(corpus))
    for w in context:
        if w not in word_to_index:
            word_to_index[w] = len(index_to_word)
            index_to_word.append(w)
        word_index = word_to_index[w]
        context_vector[word_index] += 1
    co_occurrence_matrix.append(context_vector)
co_occurrence_matrix = np.array(co_occurrence_matrix)
print(f'Co occurance matrix shape: {co_occurrence_matrix.shape}')
# Apply log transformation and entropy weighting
co_occurrence_matrix = np.log(co_occurrence_matrix + 1)
word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
co_occurrence_matrix = np.divide(co_occurrence_matrix, word_entropies)
co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)


Co occurance matrix shape: (1, 34157)


  word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)
  word_entropies = np.sum(-co_occurrence_matrix * np.log(co_occurrence_matrix), axis=0)


In [179]:
for i in range(len(co_occurrence_matrix)):
    # Perform singular value decomposition
    u, s, vt = np.linalg.svd(co_occurrence_matrix[i], full_matrices=False)
    print(f'U shape: {u.shape}')
    print(f'Vt shape: {vt.shape}')
    print(f'S shape: {s.shape}')
# word_vectors = {word: u[i, :300] for i, word in enumerate(index_to_word)} # 300 dimensions is supposedly optimal according to the article.
# context_vectors = vt[:300, :]
# word_vectors = {index_to_word[i]: u[i] for i in range(len(index_to_word))}
u.shape
# word_to_index['the']

LinAlgError: 1-dimensional array given. Array must be at least two-dimensional

In [153]:
len(co_occurrence_matrix), len(co_occurrence_matrix[0])

(1, 34157)

In [None]:
# Perform latent semantic analysis (LSA) on the contexts
# This will give us word vectors and context vectors
word_vectors, context_vectors = perform_lsa(corpus, context_length=1000)
word_vectors

In [None]:
word = 'man'
word_contexts = [i for i in context_vectors if word in word_vectors]
# if len(word_contexts) > 2000:
#     word_contexts = np.random.choice(word_contexts, size=2000, replace=False)

# Calculate the average cosine similarity between the contexts
context_similarities = [1 - cosine(word_contexts[i], word_contexts[j]) for i in range(len(word_contexts)) for j in range(len(word_contexts)) if i < j]
mean_similarity = np.mean(context_similarities)

# Calculate the SemD value
semd = -np.log(mean_similarity)
context_similarities

In [None]:

# Calculate semantic diversity (SemD) for each word
semd_values = []

for word in set(corpus):
    semd_value = calculate_semd(word, word_vectors, context_vectors)
    semd_values.append(semd_value)


In [None]:
len(semd_values)

In [None]:
# semd_values = [0 if np.isnan(x) else x for x in semd_values]
semd_values

In [None]:

# Plot the distribution of words as a function of semantic diversity
plt.figure(figsize=(10, 6))
plt.hist(semd_values, bins=30)
plt.xlabel('Semantic Diversity (SemD)')
plt.ylabel('Number of Words')
plt.title('Distribution of Words by Semantic Diversity')
plt.show()