In [1]:
text = "I had a good sleep yesterday, that's why I feel full of energy. I had my breakfast and I was ready for my journey."

In [2]:
import re

words = re.findall(r"\w+|[^\w\s]+", text)
print(words)

['I', 'had', 'a', 'good', 'sleep', 'yesterday', ',', 'that', "'", 's', 'why', 'I', 'feel', 'full', 'of', 'energy', '.', 'I', 'had', 'my', 'breakfast', 'and', 'I', 'was', 'ready', 'for', 'my', 'journey', '.']


In [3]:
vocab = set(words)
print(f"the vocabulary size : {len(vocab)}")
print(f"vocab : {vocab}")

the vocabulary size : 23
vocab : {'full', 'sleep', 's', 'was', 'feel', 'journey', 'ready', 'a', 'and', 'my', 'breakfast', 'had', ',', "'", '.', 'energy', 'why', 'I', 'good', 'that', 'for', 'yesterday', 'of'}


In [4]:
word_to_id = {word:idx for idx,word in enumerate(vocab)}

In [5]:
vocab = list(vocab)
print(f"vocab as a list : {vocab}")

vocab as a list : ['full', 'sleep', 's', 'was', 'feel', 'journey', 'ready', 'a', 'and', 'my', 'breakfast', 'had', ',', "'", '.', 'energy', 'why', 'I', 'good', 'that', 'for', 'yesterday', 'of']


In [6]:
word_to_id = {word:idx for idx,word in enumerate(vocab)}
id_to_word = {id:word for word,id in word_to_id.items()}
print(f"word to id mappings : {word_to_id}")

word to id mappings : {'full': 0, 'sleep': 1, 's': 2, 'was': 3, 'feel': 4, 'journey': 5, 'ready': 6, 'a': 7, 'and': 8, 'my': 9, 'breakfast': 10, 'had': 11, ',': 12, "'": 13, '.': 14, 'energy': 15, 'why': 16, 'I': 17, 'good': 18, 'that': 19, 'for': 20, 'yesterday': 21, 'of': 22}


In [7]:
corpus = [word_to_id[word] for word in words]
print(f"corpus : {corpus}")

corpus : [17, 11, 7, 18, 1, 21, 12, 19, 13, 2, 16, 17, 4, 0, 22, 15, 14, 17, 11, 9, 10, 8, 17, 3, 6, 20, 9, 5, 14]


In [8]:
import numpy as np

cooccur_mat = np.zeros((len(vocab),len(vocab)))
print(f"cooccurrence matrix shape : {cooccur_mat.shape}")

cooccurrence matrix shape : (23, 23)


In [9]:
window_size = 2

for word_pos,word_id in enumerate(corpus):
    for context_idx in range(1, window_size+1):
        left_word_pos = word_pos - context_idx
        right_word_pos = word_pos + context_idx

        if left_word_pos > 0:
            left_word_id = corpus[left_word_pos]
            cooccur_mat[word_id, left_word_id] += 1
        
        if right_word_pos < len(corpus):
            right_word_id = corpus[right_word_pos]
            cooccur_mat[word_id, right_word_id] += 1

In [10]:
print(f"cooccurrence matrix : {cooccur_mat}")

cooccurrence matrix : [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0

In [11]:
print(f"vector representation of word I : {cooccur_mat[word_to_id["I"]]}")
print(f"id of word had : {word_to_id['had']}")
print(f"id of word a : {word_to_id['a']}")


vector representation of word I : [1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 2. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
id of word had : 11
id of word a : 7


In [12]:
print(f"vector representation of word good : {cooccur_mat[word_to_id["good"]]}")
print(f"id of word a : {word_to_id['a']}")
print(f"id of word sleep : {word_to_id['sleep']}")

vector representation of word good : [0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
id of word a : 7
id of word sleep : 1
