# Word context

In [2]:
import numpy as np
from collections import defaultdict
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import nltk
# Fetch the 'punkt' tokenizer package into the local nltk_data directory.
# Presumably this is the data word_tokenize needs at runtime.
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sofarooq/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
def generate_word_context_matrix(corpus, window_size=2):
    """Build a symmetric-window word/context co-occurrence matrix.

    The corpus is lower-cased and split with ``word_tokenize``. For every
    token position, each *other* position at most ``window_size`` steps away
    (left or right) contributes one co-occurrence count.

    Args:
        corpus: The text to analyse, as a single string.
        window_size: Number of positions inspected on each side of a token.

    Returns:
        A ``(matrix, vocab_list)`` tuple. ``vocab_list`` is the sorted list of
        distinct tokens. ``matrix`` is a float array of shape
        ``(len(vocab_list), len(vocab_list))`` where ``matrix[i][j]`` counts
        how often ``vocab_list[j]`` occurred inside the window of
        ``vocab_list[i]``.
    """
    tokens = word_tokenize(corpus.lower())

    # Every token is part of the vocabulary, including one that never gets a
    # neighbour (single-token corpus, or a window_size of 0 or less).
    vocab_list = sorted(set(tokens))
    index_of = {word: idx for idx, word in enumerate(vocab_list)}

    # Accumulate counts directly in the dense array; only pairs that actually
    # co-occur are touched, so no V x V Python-level fill loop is needed.
    matrix = np.zeros((len(vocab_list), len(vocab_list)))
    for i, word in enumerate(tokens):
        window_start = max(i - window_size, 0)
        window_stop = min(i + window_size + 1, len(tokens))
        row = index_of[word]
        for j in range(window_start, window_stop):
            if i != j:  # a position is never its own context
                matrix[row, index_of[tokens[j]]] += 1

    return matrix, vocab_list

def similarity(word1, word2, word_context_matrix, vocab_list):
    """Return the cosine similarity of two words' context rows.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        word_context_matrix: Array indexed by position in ``vocab_list``;
            each row is reshaped to ``(1, -1)`` before comparison.
        vocab_list: Sequence of words giving the row order of the matrix.

    Returns:
        The ``[0][0]`` entry of ``cosine_similarity`` for the two rows, or an
        explanatory string when either word is absent from ``vocab_list``.
    """
    both_known = word1 in vocab_list and word2 in vocab_list
    if not both_known:
        return "One or both words not found in the vocabulary."

    # cosine_similarity works on 2-D inputs, so turn each row into a 1 x V array.
    row_a, row_b = (
        word_context_matrix[vocab_list.index(word)].reshape(1, -1)
        for word in (word1, word2)
    )

    return cosine_similarity(row_a, row_b)[0][0]

# Demo: build the co-occurrence matrix for one sentence, then compare two of its words.
corpus = "This is a simple example sentence for generating word context matrix."
window_size = 2
word1, word2 = "simple", "sentence"

matrix, vocab_list = generate_word_context_matrix(corpus, window_size)

# Show the raw counts, then the labels that give the row/column order.
print("Word Context Matrix:", matrix, sep="\n")
print("Vocabulary List:", vocab_list, sep="\n")

# Compare the context rows of the two chosen words.
score = similarity(word1, word2, matrix, vocab_list)
print("\nSimilarity between '{}' and '{}' is: {}".format(word1, word2, score))

Word Context Matrix:
[[0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0.]]
Vocabulary List:
['.', 'a', 'context', 'example', 'for', 'generating', 'is', 'matrix', 'sentence', 'simple', 'this', 'word']

Similarity between 'simple' and 'sentence' is: 0.25
