In [3]:
import numpy as np

# Toy corpus: six short, lowercase, punctuation-free sentences, one per string.
# NOTE(review): "the dog barked at the cat" is listed twice, so every pair drawn
# from it is generated twice downstream — confirm the repetition is intentional.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the mat",
    "the cat chased the dog",
    "the dog barked at the cat",
    "the dog barked at the cat",
    "the cat meowed back at the dog"
]



# Tokenize

In [6]:
# Whitespace-split every sentence of the corpus into its list of word tokens.
tokens = list(map(str.split, corpus))
tokens

[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'sat', 'on', 'the', 'mat'],
 ['the', 'cat', 'chased', 'the', 'dog'],
 ['the', 'dog', 'barked', 'at', 'the', 'cat'],
 ['the', 'dog', 'barked', 'at', 'the', 'cat'],
 ['the', 'cat', 'meowed', 'back', 'at', 'the', 'dog']]

In [16]:
# Vocabulary: every distinct token, alphabetically sorted so indices are stable.
vocab = sorted({word for sentence in tokens for word in sentence})
vocab_size = len(vocab)

# Two-way lookup tables between a word and its position in `vocab`.
word_to_ix = dict(zip(vocab, range(vocab_size)))
ix_to_word = dict(enumerate(vocab))

print(vocab)

['at', 'back', 'barked', 'cat', 'chased', 'dog', 'mat', 'meowed', 'on', 'sat', 'the']


# Generate (target, context) pairs

In [24]:
def generate_training_data(tokens, window_size=2, vocab_index=None):
    """Build (target, context) index pairs from tokenized sentences.

    Each word position in each sentence is paired with every other position
    at most ``window_size`` steps to its left or right. The window is clipped
    at the sentence boundaries, so a pair never spans two sentences.

    Parameters
    ----------
    tokens : iterable of iterable of str
        Tokenized sentences.
    window_size : int, default 2
        Maximum distance between a target word and a context word.
    vocab_index : mapping of str -> int, optional
        Word-to-index lookup. When omitted, the notebook-level ``word_to_ix``
        is used, which is what the original implementation always did.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_pairs, 2)``; each row is
        ``[target_index, context_index]``. The shape is ``(0, 2)`` when no
        pairs are produced (e.g. empty input), so column indexing stays valid.

    Raises
    ------
    KeyError
        If a token is missing from the lookup.
    """
    # Resolve the global lazily so callers passing their own mapping do not
    # need `word_to_ix` to exist at all.
    lookup = word_to_ix if vocab_index is None else vocab_index

    pairs = []
    for sentence in tokens:
        # Translate each word once instead of once per (target, context) pair.
        indices = [lookup[word] for word in sentence]
        length = len(indices)
        for i, target in enumerate(indices):
            start = max(0, i - window_size)
            stop = min(length, i + window_size + 1)
            # Skip j == i: a word is not its own context.
            pairs.extend([target, indices[j]] for j in range(start, stop) if j != i)

    # dtype + reshape keep an empty result as an integer (0, 2) array rather
    # than NumPy's default float64 array of shape (0,).
    return np.array(pairs, dtype=int).reshape(-1, 2)


# Build every [target_index, context_index] row using a two-word window on
# each side, then show the first five rows as a quick sanity check.
training_data = generate_training_data(tokens=tokens, window_size=2)
print("Sample Data: ", training_data[0:5])

Sample Data:  [[10  3]
 [10  9]
 [ 3 10]
 [ 3  9]
 [ 3  8]]
