### This notebook deals with Data Preprocessing for word embeddings using Neural networks

In [3]:
#Import the libraries
import numpy as np
import re
import nltk
import emoji
from nltk.tokenize import word_tokenize
from utils import get_dict

#### Cleaning and tokenize the data

In [4]:
#Define a sample sentence
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

Replace the punctuations with the period for simplicity

In [5]:
#Clean the data
print(f'Corpus: {corpus}')

#Replace punctuations
data = re.sub(r'[,!?;-]+', '.', corpus)

print("Cleaned data: ", data)

Corpus: Who ❤️ "word embeddings" in 2020? I do!!!
Cleaned data:  Who ❤️ "word embeddings" in 2020. I do.


In [6]:
#Tokenize the data
print("Before tokenization: ", data)

tokenized_data = word_tokenize(data)

print("Tokenized data: ", tokenized_data)

Before tokenization:  Who ❤️ "word embeddings" in 2020. I do.
Tokenized data:  ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [7]:
#Convert the data to lower case
data_cleaned = [ch.lower() for ch in tokenized_data
               if ch.isalpha() or
               ch == '.' or
               emoji.get_emoji_regexp().search(ch)
               ]

print("Cleaned data: ", data_cleaned)

Cleaned data:  ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [10]:
#Create a function to perform all the cleaning and tokenization task
def tokenize(corpus):
    
    data = re.sub(r'!,-;?]+', '.', corpus)
    data_tokenized = word_tokenize(data)
    data_cleaned = [ch.lower() for ch in data_tokenized
                   if ch.isalpha() or
                   ch == '.' or
                   emoji.get_emoji_regexp().search(ch)
                   ]
    
    return data_cleaned

In [13]:
#Test the function
corpus = 'I am happy because I am learning'

# Print new corpus
print(f'Corpus:  {corpus}')

# Save tokenized version of corpus into 'words' variable
words = tokenize(corpus)

# Print the tokenized version of the corpus
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning.
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']


In [15]:
#Test the function onto custom sentence
cust_sentence = "I love NLP!!! Thank you Coursera for this.. Do you have any other course??"

print("Original sentence: ", cust_sentence)
cust_sentence_processed = tokenize(cust_sentence)
print("Pre processed data: ", cust_sentence_processed)

Original sentence:  I love NLP!!! Thank you Coursera for this.. Do you have any other course??
Pre processed data:  ['i', 'love', 'nlp', 'thank', 'you', 'coursera', 'for', 'do', 'you', 'have', 'any', 'other', 'course']


### Sliding window of word: Get the context of the centered word

In [16]:
#Define the function to get the windows
def get_windows(words, c):
    i = c
    while i <= len(words) - c:
        center_word = words[i]
        context_words = words[(i-c):i] + words[i+1:i+c+1]
        yield context_words, center_word
        i += 1

In [17]:
#Test the function
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i
['because', 'i', 'learning']	am


In [18]:
# Print 'context_words' and 'center_word' for any sentence with a 'context half-size' of 1
for x, y in get_windows(tokenize("I love NLP!!! Thank you Coursera for this.. Do you have any other course??"), 1):
    print(f'{x}\t{y}')

['i', 'nlp']	love
['love', 'thank']	nlp
['nlp', 'you']	thank
['thank', 'coursera']	you
['you', 'for']	coursera
['coursera', 'do']	for
['for', 'you']	do
['do', 'have']	you
['you', 'any']	have
['have', 'other']	any
['any', 'course']	other
['other']	course


### Transform the words into vectors

In [19]:
#Function to map the words to index and index to words

def get_dict(data):
    words = sorted(list(set(data)))
    n = len(words)
    idx = 0
    # return these correctly
    word2Ind = {}
    Ind2word = {}
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx += 1
    return word2Ind, Ind2word

In [20]:
#Test the function
word2Ind, Ind2word = get_dict(words)

In [21]:
print("Word to index: ", word2Ind)

Word to index:  {'.': 0, 'am': 1, 'because': 2, 'happy': 3, 'i': 4, 'learning': 5}


In [23]:
print("Index to word: ", Ind2word)

Index to word:  {0: '.', 1: 'am', 2: 'because', 3: 'happy', 4: 'i', 5: 'learning'}


In [24]:
print("Length of vocabulary: ", len(word2Ind))

Length of vocabulary:  6


### Get the one-hot word vectors

In [25]:
print('happy: ', word2Ind['happy'])

happy:  3


In [26]:
#Create an empty vector
n = len(word2Ind)
center_word_vector = np.zeros(n)

idx = word2Ind['happy']
center_word_vector[idx] = 1

print("One-hot word vector of happy: ", center_word_vector)

One-hot word vector of happy:  [0. 0. 0. 1. 0. 0.]


In [27]:
#Define a function to get the one-hot word vector
def word_to_one_hot_vector(word, word2Ind, n):
    one_hot_vector = np.zeros(n)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

In [28]:
#Test the function
word_to_one_hot_vector('happy', word2Ind, n)

array([0., 0., 0., 1., 0., 0.])

### Getting context word vectors

In [29]:
#Define a list of context words
context_words = ['i', 'am', 'because', 'i']

In [31]:
# Create one-hot vectors for each context word using list comprehension
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, n) for w in context_words]

# Print one-hot vectors for each context word
context_words_vectors

[array([0., 0., 0., 0., 1., 0.]),
 array([0., 1., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.])]

In [32]:
#Calculate the mean
np.mean(context_words_vectors, axis=0)

array([0.  , 0.25, 0.25, 0.  , 0.5 , 0.  ])

In [33]:
# Define the function that will include the steps previously seen
def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    return context_words_vectors

In [35]:
#Test the function
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, n)

array([0.  , 0.25, 0.25, 0.  , 0.5 , 0.  ])

### Build training set

In [36]:
print(words)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']


In [38]:
# Print vectors associated to center and context words for corpus
for context_words, center_word in get_windows(words, 2):  # reminder: 2 is the context half-size
    print(f'Context words:  {context_words} -> {context_words_to_vector(context_words, word2Ind, n)}')
    print(f'Center word:  {center_word} -> {word_to_one_hot_vector(center_word, word2Ind, n)}')
    print()

Context words:  ['i', 'am', 'because', 'i'] -> [0.   0.25 0.25 0.   0.5  0.  ]
Center word:  happy -> [0. 0. 0. 1. 0. 0.]

Context words:  ['am', 'happy', 'i', 'am'] -> [0.   0.5  0.   0.25 0.25 0.  ]
Center word:  because -> [0. 0. 1. 0. 0. 0.]

Context words:  ['happy', 'because', 'am', 'learning'] -> [0.   0.25 0.25 0.25 0.   0.25]
Center word:  i -> [0. 0. 0. 0. 1. 0.]

Context words:  ['because', 'i', 'learning', '.'] -> [0.25 0.   0.25 0.   0.25 0.25]
Center word:  am -> [0. 1. 0. 0. 0. 0.]

Context words:  ['i', 'am', '.'] -> [0.33333333 0.33333333 0.         0.         0.33333333 0.        ]
Center word:  learning -> [0. 0. 0. 0. 0. 1.]



In [39]:
# Define the generator function 'get_training_example'
def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [41]:
# Print vectors associated to center and context words for corpus using the generator function
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, n):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.   0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 0. 1. 0. 0.]

Context words vector:  [0.   0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 0. 1. 0. 0. 0.]

Context words vector:  [0.   0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 0. 1. 0.]

Context words vector:  [0.25 0.   0.25 0.   0.25 0.25]
Center word vector:  [0. 1. 0. 0. 0. 0.]

Context words vector:  [0.33333333 0.33333333 0.         0.         0.33333333 0.        ]
Center word vector:  [0. 0. 0. 0. 0. 1.]

