In [6]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize

In [55]:
from utils2 import get_dict




In [57]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

Cleaning data with regex.

In [62]:
cleaned_corpus = re.sub(r'[,!?;-]+', '.', corpus)
print(cleaned_corpus)

Who ❤️ "word embeddings" in 2020. I do.


In [63]:
tokenized_corpus = word_tokenize(cleaned_corpus)
print(tokenized_corpus)

['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


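The punctuation marks , ! ? ; and - were replaced with '.' (each run of them becomes a single '.'); other symbols, such as the quotation marks, are still present as separate tokens.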



In [64]:
# Keep alphabetic tokens, periods and emoji (lowercased); everything else,
# such as numbers and quote marks, is dropped.
# emoji.emoji_count is used instead of emoji.get_emoji_regexp(), which was
# removed in emoji 2.0 and was being called inside the per-token filter.
again_clean = [
    token.lower()
    for token in tokenized_corpus
    if token.isalpha() or token == '.' or emoji.emoji_count(token) > 0
]

In [66]:
print(again_clean)

['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [68]:
def tokenize(corpus):
    """Split raw text into lowercase tokens, keeping words, '.' and emoji.

    Each run of the characters , ! ? ; - is first collapsed into a single
    '.', the text is tokenized with NLTK, and then every token that is
    neither purely alphabetic, nor '.', nor emoji-bearing is discarded.

    Args:
        corpus: Raw text as a single string.

    Returns:
        List of lowercased tokens in their original order.
    """
    # Collapse each run of the listed punctuation marks into one period.
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    # emoji.emoji_count replaces emoji.get_emoji_regexp(), which was removed
    # in emoji 2.0 and was being called inside the per-token filter.
    return [
        token.lower()
        for token in data
        if token.isalpha() or token == '.' or emoji.emoji_count(token) > 0
    ]

In [70]:
corpus = 'I am happy because I am learning'
print(f'Corpus:  {corpus}')
words = tokenize(corpus)
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [71]:
tokenize("Now it's your turn: try with your own sentence!")

['now', 'it', 'your', 'turn', 'try', 'with', 'your', 'own', 'sentence', '.']

# Sliding window for continuous bag of words

C is the context half-size: the number of words taken on each side of the center word.
`words` is the input corpus, already tokenized.
The function yields (context words, center word) tuples, one per center position.

In [89]:
def get_windows(words, C):
    """Slide a window over ``words`` and yield (context, center) pairs.

    Args:
        words: Sequence of tokens supporting slicing and concatenation.
        C: Number of context tokens taken on each side of the center.

    Yields:
        Tuple ``(context, center)`` where ``context`` is the C tokens before
        the center followed by the C tokens after it.
    """
    stop = len(words) - C
    position = C
    # Only positions with a full window on both sides are visited.
    while position < stop:
        center_word = words[position]
        before = words[position - C:position]
        after = words[position + 1:position + C + 1]
        yield before + after, center_word
        position += 1
    

In [90]:
for x, y in get_windows(
            ['i', 'am', 'happy', 'because', 'i', 'am', 'learning'],
            2
        ):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [91]:
for x, y in get_windows(tokenize("Now it's your turn: try with your own sentence!"), 1):
    print(f'{x}\t{y}')

['now', 'your']	it
['it', 'turn']	your
['your', 'try']	turn
['turn', 'with']	try
['try', 'your']	with
['with', 'own']	your
['your', 'sentence']	own
['own', '.']	sentence


In [94]:
word2Ind, Ind2word = get_dict(words)

In [96]:
print(word2Ind)

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}


In [98]:
print(Ind2word)

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}


Size of vocabulary equals size of created dictionary.

In [105]:
def word_to_one_hot_vector(word, word2Ind, V):
    """Encode ``word`` as a one-hot vector of length ``V``.

    Args:
        word: Token to encode; must be a key of ``word2Ind``.
        word2Ind: Mapping from token to its integer index.
        V: Length of the returned vector.

    Returns:
        NumPy array of zeros with a single 1 at ``word2Ind[word]``.
    """
    encoded = np.zeros(V)
    hot_position = word2Ind[word]
    encoded[hot_position] = 1
    return encoded

In [106]:
context_words = ['i', 'am', 'because', 'i']

In [107]:
V = len(word2Ind)

In [109]:
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
context_words_vectors


[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [111]:
np.mean(context_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [118]:
def context_words_to_vector(context_words, word2Ind, V):
    """Average the one-hot vectors of all context words.

    Args:
        context_words: Iterable of tokens, each a key of ``word2Ind``.
        word2Ind: Mapping from token to its integer index.
        V: Length of each one-hot vector.

    Returns:
        Element-wise mean of the context words' one-hot vectors.
    """
    one_hot_rows = []
    for context_word in context_words:
        one_hot_rows.append(word_to_one_hot_vector(context_word, word2Ind, V))
    return np.mean(one_hot_rows, axis=0)

In [119]:
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [120]:
def get_training_example(words, C, word2Ind, V):
    """Yield (context vector, center vector) pairs for every window.

    Args:
        words: Tokenized corpus.
        C: Number of context tokens on each side of the center word.
        word2Ind: Mapping from token to its integer index.
        V: Length of the vectors produced.

    Yields:
        Tuple of the averaged context one-hot vector and the center word's
        one-hot vector.
    """
    windows = get_windows(words, C)
    for context_words, center_word in windows:
        context_vector = context_words_to_vector(context_words, word2Ind, V)
        center_vector = word_to_one_hot_vector(center_word, word2Ind, V)
        yield context_vector, center_vector

In [122]:
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]



# Activation functions

In [125]:
# Fix NumPy's global seed so the random values below are reproducible.
np.random.seed(10)
# Scale uniform samples from [0, 1) to [-5, 5), shaped as a 5x1 column.
z_1 = 10*np.random.rand(5, 1)-5
z_1

array([[ 2.71320643],
       [-4.79248051],
       [ 1.33648235],
       [ 2.48803883],
       [-0.01492988]])

In [126]:
h = z_1.copy()

In [127]:
h < 0

array([[False],
       [ True],
       [False],
       [False],
       [ True]])

In [138]:
def relu(z):
    """Apply ReLU element-wise: negative entries become 0.

    Args:
        z: Array-like of numbers.

    Returns:
        A new NumPy array; the input is copied, not modified.
    """
    activated = np.copy(z)
    negative_mask = activated < 0
    activated[negative_mask] = 0
    return activated

In [139]:
z = np.array([[-1.25459881], [ 4.50714306], [ 2.31993942], [ 0.98658484], [-3.4398136 ]])
relu(z)

array([[0.        ],
       [4.50714306],
       [2.31993942],
       [0.98658484],
       [0.        ]])

In [140]:
def softmax(z):
    """Compute the softmax over all elements of ``z``.

    The largest entry is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing
    (and the output from becoming NaN) when the inputs are large.

    Args:
        z: Array-like of scores; normalisation runs over every element,
            regardless of shape.

    Returns:
        NumPy array with the same shape as ``z`` whose entries sum to 1.
        Empty input gives an empty array.
    """
    scores = np.asanyarray(z)
    if scores.size == 0:
        # np.max raises on an empty array; there is nothing to normalise.
        return np.exp(scores)
    # Shifting by the max makes the largest exponent exp(0) == 1.
    numerator = np.exp(scores - np.max(scores))
    denominator = np.sum(numerator)
    return numerator / denominator

In [141]:
softmax([9, 8, 11, 10, 8.5])

array([0.08276948, 0.03044919, 0.61158833, 0.22499077, 0.05020223])